# R quick-reference script: loading data, inspecting and reshaping data
# frames, and running regressions.
#
# `df`, `df1`, `df2`, `var1`, `id`, `x`, `y`, `z` and `value` are placeholders:
# substitute your own objects and column names before running a snippet.

# Set working directory and read CSV file ----
setwd("/Users/pedroromao1/Desktop/UN/R")
read.csv("/Users/pedroromao1/Desktop/UN/R/FILENAME")

# Ask for help ----
# `?` needs a topic; a bare `?` would take the next expression as its operand.
?read.csv

# Create df1 from CSV ----
df1 <- read.csv("NAME OF THE FILE.CSV", stringsAsFactors = FALSE)

# Clear the whole workspace ----
# Kept commented out: run it interactively only. Executed here it would delete
# `df1` right after it was created.
# rm(list = ls(all.names = TRUE))

# Get names of variables ----
names(df)
labels(df)
head(df, 10) # top 10 rows
attributes(df)$names

# Class of a variable ----
class(df$var1)

# Transform df$var1 from factor to numeric ----
# Go through character first: as.numeric() on a factor returns the internal
# level codes, not the values shown.
df$var1 <- as.numeric(as.character(df$var1))

# Summary of a variable ----
summary(df$var1)

# Refer to multiple variables/columns (wherever) ----
c("var1", "var2")

# Understand what you can call from a function, variable, df etc. ----
attributes(df) # from a dataset
attributes(lm(y ~ x, data = df)) # from a regression

# Count columns, rows and specific occurrences ----
length(df) # horizontal - number of variables
length(df$var1) # if a variable is given, it counts vertically
sum(is.na(df$var1)) # number of NA values in the variable
# na.rm = TRUE so that NA entries are not counted as matches.
sum(df$var1 == 0, na.rm = TRUE) # number of zeros in the variable
nrow(df) # alternative way to count rows (also allows subsetting)

# Order data ----
# The trailing comma matters: without it `[` reorders columns, not rows.
df_ordered <- df[order(df$var1), ]

# Select observations in df1 that exist in df2 ----
# Match on a key column; `df1 %in% df2` would compare whole columns instead.
df3 <- df1[df1$id %in% df2$id, ]

# Load dplyr and tidyr for data viewing ----
library("dplyr")
library("tidyr")

# Arrange df so it is easy to view ----
df <- as_tibble(df) # as_data_frame() is deprecated
df # call the df

# Within tidyr - reshape df1 from wide to long, or the opposite, into df2 ----
# pivot_longer()/pivot_wider() replace the superseded gather()/spread().
df2 <- pivot_longer(df1, cols = c(...), names_to = "x", values_to = "y")
df2 <- pivot_wider(df1, names_from = x, values_from = y)

# Within dplyr - group by variable and execute a command ----
df2 <- df1 %>%
  group_by(x, y, z) %>%
  summarize(NewColumn_df2 = mean(value)) %>% # for a statistic
  ungroup() # summarize() only drops the last grouping level
# or (after group_by) count()
df1 %>%
  group_by(x, y, z) %>%
  count()

# Regressions ----
reg1 <- lm(y ~ x, data = df)
# Extra terms belong inside the formula, joined with `+`; passed as separate
# arguments they would be taken as lm()'s `data` and `subset`.
# NOTE(review): `y` is also the response here - presumably a placeholder.
lm(
  y ~ x +
    I(x^2) + # second-order polynomial (squared) term
    I(z * y), # interaction term
  data = df
)

# Heteroskedasticity-robust standard errors ----
library("lmtest")
library("sandwich")
coeftest(reg1, vcov = vcovHC(reg1, type = "HC1"))

# STATA-like clustered std. errors ----
library(estimatr)
lm_robust(y ~ x, data = df1, clusters = x, se_type = "stata")