## ECON 370 LAB 3:  DATA VISUALIZATION 
## NAME:  
## DATE:  


# preliminaries ----------------------------------------------------------------

## install packages / load libraries as needed
## there are quite a few new R packages that we'll be using in this lab

# install.packages("tidyverse") # assuming you've got this installed
## packages this lab needs beyond the tidyverse
lab_packages <- c(
  "haven",      # to load data in Stata's .dta form
  "fixest",     # to run regressions w/ heteroskedasticity-robust SEs
  "broom",      # to clean up regression results
  "ggokabeito"  # to easily access the Okabe-Ito colorblind-friendly palette
)

## only install what is missing, so re-running the script does not
##   re-download and reinstall every package each time
missing_packages <- lab_packages[
  !vapply(lab_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(tidyverse)
library(haven)
library(fixest)
library(broom)
library(ggokabeito)

## Or in Python: 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # graphs
import seaborn as sns # graphs
import os
import statsmodels.api as sm # regressions
import statsmodels.formula.api as smf # regressions

## I like to load the Okabe-Ito colors by hand
## Replace <- with equals signs for Python
## hex codes for the Okabe-Ito colors used in this lab
oiorange <- "#E69F00"  # orange
oigreen  <- "#009E73"  # bluish green
oiyellow <- "#F0E442"  # yellow
oiblue   <- "#0072B2"  # blue
oiverm   <- "#D55E00"  # vermillion
oipurple <- "#CC79A7"  # reddish purple


## specify the file path to your folder for this lab
## R and Python below, delete the one you are not using

## In R:
## USERNAME environment variable; Sys.getenv() returns "" when it is not set
username <- Sys.getenv("USERNAME")
## instructor's lab folder, built from the login name
pjpath <- paste0("C:/Users/", username, "/Dropbox/ECON-370/topics/3a-visualization/lab/")
## placeholder: replace with the path to your own lab folder
yourpath <- "[YOUR FILE PATH]"
## this is a single (scalar) choice, so use plain if/else and %in%
##   rather than the vectorized if_else() with elementwise |
mypath <- if (username %in% c("pj", "pjaki")) pjpath else yourpath

## In Python:

## login name from the environment (os.getenv gives None when USERNAME is unset)
username = os.getenv("USERNAME")

## candidate lab folders: the instructor's, and yours (edit yourpath as needed)
pjpath = f"C:/Users/{username}/Dropbox/ECON-370/topics/3a-visualization/lab/py/"
yourpath = f"C:/Users/{username}/Dropbox/ECON-370/topics/3a-visualization/lab/py/"

## use the instructor's folder on the instructor's accounts, otherwise yours
mypath = pjpath if username in ("pj", "pjaki") else yourpath

print(mypath)


# load data --------------------------------------------------------------------

## obtain the stata data file gemdata.dta from the replication files for 
##   "A Firm of One's Own:  Experimental Evidence on Credit Constraints and Occupational Choice"
##   which are available from the Harvard dataverse here:
##   https://dataverse.harvard.edu/file.xhtml?fileId=10361258&version=1.0

## select the variables characterizing randomly-assigned treatments (control, gem, cash)
##   as well as the variables indicating whether someone was self-employed in each survey round
##   (y1_selfemp, y2_selfemp, y6_selfemp)

## generate a treatment column in your data frame / tibble that takes on the values 
##   Control, Franchise, and Grant where Franchise is the GEM treatment and Grant is the cash treatment

## R hint:  read_dta() from the haven package
## Python hint:  pd.read_stata()


# run regressions --------------------------------------------------------------------

## to run regressions with heteroskedasticity-robust standard errors 
##   (which is the only kind worth running!) we cannot use lm()
##   There are many ways to do this, but I recommend using feols() from the fixest package


## this command will run a regression of y1_selfemp on the dummies for the gem and cash treatments
##   generating heteroskedasticity-robust SEs (identical to stata's robust option)
##   NOTE: you'll get an alert indicating that some observations were dropped because of missing data
##   these are attritors: individuals who did not complete the follow-up surveys

## In R: y1.ols <- feols(y1_selfemp ~ gem + cash, vcov = "HC1", data = gemdata)
## In Python:  y1_ols = smf.ols('y1_selfemp ~ gem + cash', data=gemdata).fit(cov_type='HC1')


## R hint: you can then use broom's tidy() command to store a nice little tibble of results from y1.ols
## Python hint:  define a data frame of results which are in y1_ols.params, y1_ols.bse, y1_ols.pvalues

## extend the code to do this for the Year 2 and Year 6 data as well, combine the tibbles, and 
##   create a character/string variable indicating whether results are from Year 1, Year 2, or Year 6

## Once you combine the results into a single data frame, you will also need to create two additional columns:
##   the lower and upper limits of the 95% confidence interval, which are 
##   the coefficient - 1.96*SE and the coefficient + 1.96*SE


# make your graph --------------------------------------------------------------------

## Now try to replicate my graph.
## Set the default aesthetics: label is on the x axis, estimate is on the y axis, 
##   term is the fill aesthetic (so the bars are different colors for the two treatments)