# Set the working directory and read a CSV file
# NOTE(review): the absolute path below only exists on one machine — adjust before running elsewhere.
setwd("/Users/pedroromao1/Desktop/UN/R")
# Keep the result: an unassigned read.csv() call discards the data
# (and prints nothing at all when the script is run through source()).
csv_data <- read.csv("/Users/pedroromao1/Desktop/UN/R/FILENAME")

# Ask for help on a function or topic (replace read.csv with the name you need)
# A topic must follow `?`; on its own, `?` swallows the next expression in the script.
?read.csv

# Load the CSV file into df1, keeping text columns as character (not factor)
df1 <- read.csv(
  file = "NAME OF THE FILE.CSV",
  stringsAsFactors = FALSE
)

# Clear the workspace: remove every object listed by ls() in the current
# environment, including hidden ones whose names start with "." (all.names=TRUE).
# NOTE(review): objects created before this line are removed as well — confirm
# that running it at this point of the script is intended.
rm(list = ls(all.names=TRUE))

# Get names of variables (df is the data frame being inspected)
names(df) # column names
labels(df) # labels of the object
head(df, 10) # first 10 rows
attributes(df)$names # column names, read from the attributes list

# Class of a variable (df[[1]] is the first column; df$name works for a named column)
class(df[[1]])

# Convert the first column from factor to numeric
# Go through as.character() so the labels are converted, not the internal integer codes
df[[1]] <- as.numeric(as.character(df[[1]]))

# Summary of a variable
summary(df[[1]])

# Refer to multiple variables/columns (anywhere a vector of names or positions is accepted)
c() # put the column names or positions inside c(), e.g. c("x", "y")

# Discover which components can be extracted from an object (data frame, model fit, ...)
attributes(df) # attributes of a data set
attributes(lm(formula = y ~ x, data = df)) # attributes of a fitted regression

# Count columns, rows and specific occurrences
length(df) # horizontal - number of variables (columns)
length(df[[1]]) # on a single column it counts vertically (number of values)
sum(is.na(df[[1]])) # number of missing values in the first column
sum(df[[1]] == 0, na.rm = TRUE) # number of zeros; na.rm = TRUE keeps NA entries out of the count
nrow(df) # alternative way to count rows (also works on a subset of df)

# Order data: sort the rows of df by the first column
# The comma selects rows (without it the columns are reordered);
# drop = FALSE keeps a data frame even when df has a single column.
df_ordered <- df[order(df[[1]]), , drop = FALSE]

# Select the observations (rows) of df1 whose key value also exists in df2
# NOTE(review): `id` stands for the key column shared by both data frames — replace with the real name.
df3 <- df1[df1$id %in% df2$id, , drop = FALSE]

#Load dplyr and tidyr for data manipulation and viewing
library("dplyr")
library("tidyr")
       
# Convert df to a tibble for compact, readable printing
# (as_tibble() replaces the deprecated as_data_frame())
df <- as_tibble(df)
df # print the tibble

# With tidyr - reshape df1 into df2, wide to long or long to wide (run the one you need)
# Wide to long: stack the selected columns into a name column "x" and a value column "y".
# everything() stacks all columns; narrow `cols` to the columns that should be stacked.
df2 <- pivot_longer(df1, cols = everything(), names_to = "x", values_to = "y")
# Long to wide: one new column per distinct value of x, filled with the values of y.
df2 <- pivot_wider(df1, names_from = x, values_from = y)

# With dplyr - group by variables and compute one row per group
df2 <- df1 %>%
  group_by(x, y, z) %>%
  summarize(NewColumn_df2 = mean(value)) %>% # summary statistic per group
  ungroup() # drop the remaining grouping so later steps are not grouped by accident

# or: number of rows in each group
df2_counts <- df1 %>%
  count(x, y, z)
       
# Regressions
lm(y ~ x, data = df) # simple linear regression
# Extra terms are joined with `+` inside the formula; separated by commas they
# would be passed to lm() as other arguments instead of being regressors.
lm(
  y ~ x +
    I(x^2) + # second-order (squared) term
    I(x * z), # interaction term
  data = df
)
# NOTE(review): the interaction is written with x and z because the response y
# cannot appear on the right-hand side — confirm the intended pair of variables.

# Heteroskedasticity: test for it and report robust standard errors
library("lmtest")
library("sandwich")
# NOTE(review): placeholder model — replace with the regression of interest.
reg1 <- lm(y ~ x, data = df)
bptest(reg1) # Breusch-Pagan test for heteroskedasticity
# Coefficient tests using the HC1 heteroskedasticity-consistent covariance matrix;
# the argument is spelled `vcov.` in coeftest(), so it is written out in full.
coeftest(reg1, vcov. = vcovHC(reg1, type = "HC1"))

# Clustered standard errors computed the way Stata does (se_type = "stata")
library(estimatr)
# NOTE(review): clusters = x clusters on the regressor itself — presumably a
# placeholder for the cluster identifier; confirm.
lm_robust(
  formula = y ~ x,
  data = df1,
  clusters = x,
  se_type = "stata"
)