Chapter 4 Appendix: Importing the original datasets

This section can be a bit obscure. It is only included to make the dataset import steps transparent. It is important to mention that we converted the datasets from DTA (Stata, a closed-source format) to Arrow Parquet (a cross-language, open-source format).

We chose Arrow instead of CSV/TSV because Arrow files are always read with the correct column specification (i.e., a column with values such as “00123” is always read as a string and is never mistaken for a number).

4.1 Downloading the original datasets

# Remote archive with the original application files, the local name of the
# downloaded zip, and the directory the extracted files are moved to.
appfiles_url <- "https://vi.unctad.org/tpa/web/zips/vol2/Advanced%20Guide%20to%20TPA.zip"
appfiles_zip <- "00-application-files.zip"
appfiles_dir <- "00-application-files"

# Download only once; an existing local archive is reused on later runs.
if (!file.exists(appfiles_zip)) {
  tryCatch(
    # mode = "wb" requests a binary transfer so the zip is written unchanged
    download.file(appfiles_url, appfiles_zip, mode = "wb"),
    error = function(e) {
      # drop any partial file, otherwise the file.exists() guard above would
      # skip the download on the next run and unzip() would fail
      unlink(appfiles_zip)
      stop(e)
    }
  )
}

# Extract only once, then rename the extracted "Advanced Guide to TPA"
# directory to the shorter name stored in appfiles_dir.
if (!dir.exists(appfiles_dir)) {
  unzip(appfiles_zip)
  file.rename("Advanced Guide to TPA", appfiles_dir)
}

4.2 Converting the original datasets

# these packages are only used to import the data
library(haven)
library(arrow)
library(stringr)
library(janitor)
library(purrr)

# Output directory for the converted files. dir.create() only warns (it does
# not error) when the directory already exists, so silencing the warning is
# enough and no try() wrapper is needed.
dir.create("data-parquet", showWarnings = FALSE)

# Every Stata file below the extracted directory, at any depth. The pattern is
# anchored so that only file names ending in ".dta" are picked up.
dta_files <- list.files(appfiles_dir,
                        pattern = "\\.dta$",
                        full.names = TRUE,
                        recursive = TRUE)

# Convert one Stata file into a Parquet file under "data-parquet/".
#
# The output name is derived from the input path: the `appfiles_dir` prefix is
# dropped, the first "Chapter" becomes "ch", any remaining "Chapter<digit>"
# token and the ".dta" extension are removed, "/" becomes "_", the result is
# passed through make_clean_names(), and a few long folder names are stripped.
#
# finp: path to a single .dta file (a character vector of length one).
#
# Returns the value of write_parquet() when the file is converted, or NULL
# when the Parquet file already exists and the conversion is skipped.
read_and_clean <- function(finp) {
  stopifnot(is.character(finp), length(finp) == 1L)
  message(finp)

  # fixed() makes the directory name a literal match, so regex metacharacters
  # in `appfiles_dir` (e.g. ".") cannot change what gets removed
  fout <- finp %>%
    str_replace(fixed(appfiles_dir), "") %>%
    str_replace("Chapter", "ch") %>%
    str_replace_all("Chapter[0-9]|\\.dta", "")

  # flatten the directory structure into a single snake_case name
  fout <- fout %>%
    str_replace_all("(/)", "_") %>%
    make_clean_names()

  # folder-name fragments that only add length to the output file name
  long_names <- c(
    "datasets_",
    "applications_",
    "exercises_",
    "1_trade_without_border_results_1",
    "2_rt_as_effects_results_2_"
  )

  fout <- fout %>%
    str_replace_all(paste(long_names, collapse = "|"), "")

  # collapse the first "_<digit>_" or "__" left over by the removals above
  fout <- str_replace(fout, "_([0-9])_|__", "_")

  fout2 <- sprintf("data-parquet/%s.parquet", fout)

  # skip files that were already converted in a previous run
  if (!file.exists(fout2)) {
    read_dta(finp) %>%
      clean_names() %>%
      write_parquet(fout2)
  }
}

# walk() calls read_and_clean() on each file for its side effects only; map()
# would collect every returned value into a list and print it at top level
walk(dta_files, read_and_clean)