# Promote every warning to an error so that problems stop the script at once.
options(warn=2)
# Very large scipen penalty: numbers are printed in fixed, not scientific, notation.
options(scipen = 999)
# Every relative path used below (the per-product CSV files in ".", the
# "../GTAP" concordance and the "../" output files) is resolved against this
# directory.
setwd("C:/Users/Alexi/OneDrive - United Nations/_R/aves/merged-11-April-2020")
library("estimatr")
library("car")
library("stringi")
library("stringr")
library("dplyr")
library("data.table")
library("readxl")
# One row per CSV file in the working directory. The text before the first dot
# of the file name is used as the HS6 code, and its first two characters as the
# numeric HS chapter (HS2).
files <- dir(path = ".", pattern = "\\.csv$")
hs_detailed <- data.frame(
  hs6 = as.character(stringr::str_split_fixed(files, "[.]", 2)[, 1]),
  stringsAsFactors = FALSE
)
hs_detailed$HS2 <- as.numeric(substr(hs_detailed$hs6, 1, 2))

# First chapter of sections 2..21. Chapters below 6 form section 1, so the
# section number is the count of start points <= HS2, plus one.
# (Assumes HS2 holds whole chapter numbers, as produced by the two-character
# prefix above.)
section_starts <- c(6, 15, 16, 25, 28, 39, 41, 44, 47, 50,
                    64, 68, 71, 72, 84, 86, 90, 93, 94, 97)
hs_detailed$Section <- findInterval(hs_detailed$HS2, section_starts) + 1

# Chapters above 97 are not assigned to a section: they keep their chapter
# number as the Section value.
beyond_last_section <- !is.na(hs_detailed$HS2) & hs_detailed$HS2 > 97
hs_detailed$Section[beyond_last_section] <- hs_detailed$HS2[beyond_last_section]
# Attach the HS12 -> GTAP concordance to the product list. The concordance
# codes are left-padded with zeros to six characters so that they can be
# matched against the hs6 strings taken from the file names; products without
# a concordance entry are kept (all.x = TRUE).
com_con <- read.csv(
  file = "../GTAP/HS12-gtap.csv",
  stringsAsFactors = FALSE
)
com_con$hs6 <- stri_pad_left(com_con$hs6, width = 6, pad = 0)
hs_detailed <- merge(
  x = hs_detailed,
  y = com_con,
  by = "hs6",
  all.x = TRUE
)
# ISO3 codes collected under the name EUN (referenced only by a commented-out
# line further down in this script).
EUN <- c(
  "AUT", "BEL", "BGR", "HRV", "CYP", "CZE", "DNK", "EST", "FIN", "FRA",
  "DEU", "GRC", "HUN", "IRL", "ITA", "LVA", "LTU", "LUX", "MLT", "NLD",
  "POL", "PRT", "ROM", "SVK", "SVN", "ESP", "SWE", "GBR"
)

# Grouping used for the regressions.
sections <- unique(hs_detailed$Section) # this could also be Section, HS2

# One (initially NULL) slot per product, named by HS6 code, for the
# reporter-level interaction estimates of ntm_count_t and ntm_count_nt.
hs_beta_list <- setNames(vector("list", nrow(hs_detailed)), hs_detailed$hs6)
hs_beta_list_nt <- setNames(vector("list", nrow(hs_detailed)), hs_detailed$hs6)

# Empty table (zero rows, twelve columns) that receives one row per section.
results_summary_cols <- c(
  "section",
  "ntm_count_t_b",
  "ntm_count_nt_b",
  "ntm_count_t_p",
  "ntm_count_nt_p",
  "ntm_count_t_int",
  "ntm_count_t_int_sign",
  "ntm_count_nt_int",
  "ntm_count_nt_int_sign",
  "n",
  "r",
  "adj"
)
results_summary <- data.frame(
  matrix(
    logical(0),
    nrow = 0,
    ncol = 12,
    dimnames = list(NULL, results_summary_cols)
  ),
  stringsAsFactors = FALSE
)
# Trial run for the first section: pool the trimmed price data of all of its
# products into `merged5`.
#   s    - index into `sections`
#   n    - percentage of observations trimmed from a tail of the price
#          distribution
#   both - TRUE trims the bottom tail as well as the top tail
# `n` and `both` are also read by the loop over all sections further down.
s <- 1
n <- 1
both <- TRUE
subsetx <- hs_detailed[hs_detailed$Section == sections[s], ]

# Each product's rows go into their own list slot and are bound once at the
# end. Growing `merged5` by row-index assignment copied the whole table for
# every file and aligned columns by position rather than by name.
section_parts <- vector("list", nrow(subsetx))
for (f in seq_len(nrow(subsetx))) {
  hs_code <- subsetx[f, "hs6"]
  cat( "--- hs code: ",hs_code," ---\n"); flush.console()
  merged4 <- read.csv(paste0(hs_code, ".csv"), stringsAsFactors = FALSE)
  # remove Inf or NA prices
  merged4 <- merged4[!is.na(merged4$price) & merged4$price < Inf, ]
  # truncate the top n percent of prices ...
  merged4 <- merged4[
    merged4$price < quantile(merged4$price, probs = 1 - n / 100),
  ]
  # ... and, if requested, the bottom n percent of what is left
  if (both) {
    merged4 <- merged4[
      merged4$price > quantile(merged4$price, probs = n / 100),
    ]
  }
  if (nrow(merged4) > 0) {
    section_parts[[f]] <- merged4
  }
}

# Slots of products without usable rows are still NULL; drop them before
# binding. rbind() matches columns by name.
section_parts <- Filter(Negate(is.null), section_parts)
if (length(section_parts) > 0) {
  merged5 <- do.call(rbind, section_parts)
  # we need to assure unique row names
  rownames(merged5) <- NULL
} else {
  merged5 <- data.frame()
}
cat( "---SECTION: ",s," : RUNNING REGRESSION---\n"); flush.console()

# Two NTM counts per observation, built from the columns named A-C and E-I.
# NOTE(review): presumably "_t" / "_nt" stand for technical / non-technical
# measures - confirm against the data dictionary.
merged5$ntm_count_t <- merged5$A + merged5$B + merged5$C
merged5$ntm_count_nt <- merged5$E + merged5$F + merged5$G + merged5$H + merged5$I #+ merged5$J + merged5$L + merged5$N  + merged5$O

# Print warnings as they occur instead of turning them into errors (warn = 2
# is set at the top of the script). The model used to be fitted twice with the
# same call, once before and once after this switch: the first fit either
# aborted the script on any warning or was thrown away, so only the fit made
# under warn = 1 is kept. estimatr is already attached at the top of the
# script, so the repeated library() calls were dropped as well.
options(warn = 1)

# Log price on gravity controls, applied tariff, the two NTM counts, product /
# reporter / partner fixed effects and reporter-specific slopes for both NTM
# counts; standard errors are clustered on `dist`.
reg1 <- lm_robust(log(price) ~
                    log(dist) + contig + comlang_off + rta + llocked +
                    applied + ntm_count_t +
                    ntm_count_nt +
                    factor(ProductCode) +
                    ReporterISO3 +
                    PartnerISO3
                  + ReporterISO3*ntm_count_t
                  + ReporterISO3*ntm_count_nt
                  ,
                  cluster = dist, data = merged5, se_type = "stata")
# Estimate and p-value of every coefficient of the current fit. data.frame()
# turns the column name "Pr(>|t|)" into "Pr...t..".
coef_table <- summary(reg1)$coefficients
summary_results <- data.frame(coef_table[, c("Estimate", "Pr(>|t|)")])

# Diagnostic exports, written one directory above the working directory.
write.csv(summary_results, "../summary_results.csv")
# NOTE(review): the filter keeps reporter "CRI" while the file is called
# Reporter_BFA.csv - confirm which of the two is intended.
write.csv(merged5[merged5$ReporterISO3=="CRI",], "../Reporter_BFA.csv")

# Keep only the interaction terms (coefficient names that contain ":") ...
is_interaction <- grepl("[:]", row.names(coef_table))
summary_results <- summary_results[is_interaction, ]

# ... and split them by the NTM count they belong to. The two patterns do not
# overlap: "ntm_count_t" is not a substring of "ntm_count_nt".
interaction_names <- row.names(summary_results)
summary_results_t <- data.frame(
  summary_results[grepl("ntm_count_t", interaction_names), ]
)
summary_results_nt <- data.frame(
  summary_results[grepl("ntm_count_nt", interaction_names), ]
)
# format the list to store and then merge later on
# For each NTM count: drop interactions without an estimate, keep the last
# three characters of the coefficient name as the reporter code, and rename
# the columns to reporter / beta_interaction / p.
summary_results_t_for_list <-
  summary_results_t[!is.na(summary_results_t$Estimate), ]
summary_results_t_for_list$reporter <-
  str_sub(rownames(summary_results_t_for_list), -3, -1)
# NULL renumbers the rows 1..n and, unlike 1:nrow(x), also works for zero rows
rownames(summary_results_t_for_list) <- NULL
summary_results_t_for_list <- summary_results_t_for_list %>%
  select(reporter, Estimate, `Pr...t..`) %>%
  rename(p = `Pr...t..`, beta_interaction = Estimate)

summary_results_nt_for_list <-
  summary_results_nt[!is.na(summary_results_nt$Estimate), ]
summary_results_nt_for_list$reporter <-
  str_sub(rownames(summary_results_nt_for_list), -3, -1)
rownames(summary_results_nt_for_list) <- NULL
summary_results_nt_for_list <- summary_results_nt_for_list %>%
  select(reporter, Estimate, `Pr...t..`) %>%
  rename(p = `Pr...t..`, beta_interaction = Estimate)

# Every product of the section receives the same section-level tables.
# `summary_results_t_for_list` is deliberately not removed here: it is
# inspected at the end of this block.
hs_beta_list[subsetx[, "hs6"]] <- list(summary_results_t_for_list)
hs_beta_list_nt[subsetx[, "hs6"]] <- list(summary_results_nt_for_list)

#attributes(summary(reg1))
# One summary row for the section; summary() is computed once and reused.
fit_summary <- summary(reg1)
coef_table <- fit_summary$coefficients
results_summary[s, "section"] <- sections[s]
results_summary[s, "ntm_count_t_b"] <- coef_table["ntm_count_t", "Estimate"]
results_summary[s, "ntm_count_t_p"] <- coef_table["ntm_count_t", "Pr(>|t|)"]
# Full coefficient table again; the statements that follow this block subset it.
summary_results <- data.frame(coef_table[, c("Estimate", "Pr(>|t|)")])
if ("ntm_count_nt" %in% row.names(summary_results)) {
  results_summary[s, "ntm_count_nt_b"] <- coef_table["ntm_count_nt", "Estimate"]
  results_summary[s, "ntm_count_nt_p"] <- coef_table["ntm_count_nt", "Pr(>|t|)"]
}
# Number of interaction terms, and how many of them have p < 0.1
p_t <- summary_results_t[, 2]
p_nt <- summary_results_nt[, 2]
results_summary[s, "ntm_count_t_int"] <- nrow(summary_results_t)
results_summary[s, "ntm_count_t_int_sign"] <- sum(!is.na(p_t) & p_t < 0.1)
results_summary[s, "ntm_count_nt_int"] <- nrow(summary_results_nt)
results_summary[s, "ntm_count_nt_int_sign"] <- sum(!is.na(p_nt) & p_nt < 0.1)
results_summary[s, "n"] <- nrow(merged5)
results_summary[s, "r"] <- fit_summary$r.squared
results_summary[s, "adj"] <- fit_summary$adj.r.squared

# Inspection of the trial run
results_summary
hs_beta_list_nt
# The data viewer is only opened in an interactive session
if (interactive()) {
  View(hs_beta_list)
}
summary_results_t_for_list
head(merged5)
# Rebuild the interaction tables of the current fit for inspection.
# `summary_results` is expected to hold the full coefficient table here, so a
# mask computed from the fit's coefficient names lines up with its rows.
coef_names <- row.names(summary(reg1)$coefficients)
summary_results <- summary_results[grepl("[:]", coef_names), ]
summary_results

# Split the interaction terms by the NTM count they belong to
interaction_names <- row.names(summary_results)
summary_results_t <- data.frame(
  summary_results[grepl("ntm_count_t", interaction_names), ]
)
summary_results_t
summary_results_nt <- data.frame(
  summary_results[grepl("ntm_count_nt", interaction_names), ]
)

# format the list to store and then merge later on
summary_results_t_for_list <-
  summary_results_t[!is.na(summary_results_t$Estimate), ]
# reporter code = last three characters of the coefficient name
summary_results_t_for_list$reporter <-
  str_sub(rownames(summary_results_t_for_list), -3, -1)
# NULL renumbers the rows 1..n and, unlike 1:nrow(x), also works for zero rows
rownames(summary_results_t_for_list) <- NULL
summary_results_t_for_list <- summary_results_t_for_list %>%
  select(reporter, Estimate, `Pr...t..`) %>%
  rename(p = `Pr...t..`, beta_interaction = Estimate)
summary_results_t_for_list
# Helper for the loop below: turns a table of interaction coefficients (row
# names = coefficient names, columns Estimate and Pr...t..) into the form
# stored per product: rows without an estimate dropped, a `reporter` column
# holding the last three characters of the coefficient name, and columns
# reporter / beta_interaction / p.
format_interactions <- function(coefs) {
  coefs <- coefs[!is.na(coefs$Estimate), ]
  coefs$reporter <- str_sub(rownames(coefs), -3, -1)
  # NULL renumbers the rows 1..n and, unlike 1:nrow(x), works for zero rows
  rownames(coefs) <- NULL
  coefs %>%
    select(reporter, Estimate, `Pr...t..`) %>%
    rename(p = `Pr...t..`, beta_interaction = Estimate)
}

# Main pass. For every section: pool the trimmed price data of its products,
# fit the price regression with reporter-specific NTM slopes, store the
# reporter-level interaction estimates for each product of the section and
# write one summary row into `results_summary`.
# Reads `n` (trim percentage) and `both` (trim the bottom tail too), which are
# set earlier in the script.
for (s in seq_along(sections)) {
  subsetx <- hs_detailed[hs_detailed$Section == sections[s], ]

  # Each product's rows go into their own list slot and are bound once at the
  # end. Growing `merged5` by row-index assignment copied the whole table for
  # every file and aligned columns by position rather than by name.
  section_parts <- vector("list", nrow(subsetx))
  for (f in seq_len(nrow(subsetx))) {
    hs_code <- subsetx[f, "hs6"]
    cat( "--- hs code: ",hs_code," ---\n"); flush.console()
    merged4 <- read.csv(paste0(hs_code, ".csv"), stringsAsFactors = FALSE)
    # remove Inf or NA prices
    merged4 <- merged4[!is.na(merged4$price) & merged4$price < Inf, ]
    # truncate the top n percent of prices ...
    merged4 <- merged4[
      merged4$price < quantile(merged4$price, probs = 1 - n / 100),
    ]
    # ... and, if requested, the bottom n percent of what is left
    if (both) {
      merged4 <- merged4[
        merged4$price > quantile(merged4$price, probs = n / 100),
      ]
    }
    if (nrow(merged4) > 0) {
      section_parts[[f]] <- merged4
    }
  }
  section_parts <- Filter(Negate(is.null), section_parts)

  # Without any usable observation the model cannot be fitted: record the
  # empty section and carry on with the next one instead of failing.
  if (length(section_parts) == 0) {
    cat("---SECTION: ", s, " : NO USABLE PRICE DATA, SKIPPED---\n")
    flush.console()
    results_summary[s, "section"] <- sections[s]
    results_summary[s, "n"] <- 0
    next
  }

  # rbind() matches columns by name
  merged5 <- do.call(rbind, section_parts)
  # we need to assure unique row names
  rownames(merged5) <- NULL

  cat( "---SECTION: ",s," : RUNNING REGRESSION---\n"); flush.console()
  merged5$ntm_count_t <- merged5$A + merged5$B + merged5$C
  merged5$ntm_count_nt <- merged5$E + merged5$F + merged5$G + merged5$H + merged5$I #+ merged5$J + merged5$L + merged5$N  + merged5$O
  ### No D for some reason
  #merged5[(merged5$ReporterISO3 %in% EUN)&(merged5$PartnerISO3 %in% EUN),c("ntm_count_t", "ntm_count_nt")] <- 0

  # Log price on gravity controls, applied tariff, the two NTM counts,
  # product / reporter / partner fixed effects and reporter-specific slopes
  # for both NTM counts; standard errors are clustered on `dist`.
  reg1 <- lm_robust(log(price) ~
                      log(dist) + contig + comlang_off + rta + llocked +
                      applied + ntm_count_t +
                      ntm_count_nt +
                      factor(ProductCode) +
                      ReporterISO3 +
                      PartnerISO3
                    + ReporterISO3*ntm_count_t
                    + ReporterISO3*ntm_count_nt
                    ,
                    cluster = dist, data = merged5, se_type = "stata")

  # summary() is computed once per fit and reused below. data.frame() turns
  # the column name "Pr(>|t|)" into "Pr...t..".
  fit_summary <- summary(reg1)
  coef_table <- fit_summary$coefficients
  summary_results <- data.frame(coef_table[, c("Estimate", "Pr(>|t|)")])

  # Diagnostics -------------------------------------------------------------
  #
  #   head(merged5)
  #
  #   merged5[!complete.cases(merged5),]
  #
  #   write.csv(summary_results, "../summary_results.csv")
  #
  #   NAs <- rownames(summary_results[is.na(summary_results$Estimate),])
  #
  #
  #   merged5[merged5$ReporterISO3=="BFA",]
  #
  #   write.csv(merged5[merged5$ReporterISO3=="CRI",], "../Reporter_BFA.csv")
  #
  # Diagnostics -------------------------------------------------------------

  # Interaction terms are the coefficients whose name contains ":"; they are
  # split by NTM count. The two patterns do not overlap: "ntm_count_t" is not
  # a substring of "ntm_count_nt".
  interactions <- summary_results[grepl("[:]", row.names(coef_table)), ]
  interaction_names <- row.names(interactions)
  summary_results_t <- data.frame(
    interactions[grepl("ntm_count_t", interaction_names), ]
  )
  summary_results_nt <- data.frame(
    interactions[grepl("ntm_count_nt", interaction_names), ]
  )

  # format the list to store and then merge later on
  summary_results_t_for_list <- format_interactions(summary_results_t)
  summary_results_nt_for_list <- format_interactions(summary_results_nt)

  # Every product of the section receives the same section-level tables
  hs_beta_list[subsetx[, "hs6"]] <- list(summary_results_t_for_list)
  hs_beta_list_nt[subsetx[, "hs6"]] <- list(summary_results_nt_for_list)

  #attributes(summary(reg1))
  results_summary[s, "section"] <- sections[s]
  results_summary[s, "ntm_count_t_b"] <- coef_table["ntm_count_t", "Estimate"]
  results_summary[s, "ntm_count_t_p"] <- coef_table["ntm_count_t", "Pr(>|t|)"]
  if ("ntm_count_nt" %in% row.names(coef_table)) {
    results_summary[s, "ntm_count_nt_b"] <-
      coef_table["ntm_count_nt", "Estimate"]
    results_summary[s, "ntm_count_nt_p"] <-
      coef_table["ntm_count_nt", "Pr(>|t|)"]
  }
  # Number of interaction terms, and how many of them have p < 0.1
  p_t <- summary_results_t[, 2]
  p_nt <- summary_results_nt[, 2]
  results_summary[s, "ntm_count_t_int"] <- nrow(summary_results_t)
  results_summary[s, "ntm_count_t_int_sign"] <- sum(!is.na(p_t) & p_t < 0.1)
  results_summary[s, "ntm_count_nt_int"] <- nrow(summary_results_nt)
  results_summary[s, "ntm_count_nt_int_sign"] <- sum(!is.na(p_nt) & p_nt < 0.1)
  results_summary[s, "n"] <- nrow(merged5)
  results_summary[s, "r"] <- fit_summary$r.squared
  results_summary[s, "adj"] <- fit_summary$adj.r.squared
}
