***************************************************************************
***************************************************************************

* Taxi Data Application for the paper:

* "Linear IV Regression Estimators for Structural Dynamic Discrete Choice Models"

* by Myrto Kalouptsidi (Harvard), Paul Scott (NYU), and Eduardo Souza-Rodrigues (UToronto)

* This version: February, 2020

***************************************************************************
***************************************************************************


***************************************************************************
* 1. PREPARE DATA
***************************************************************************

* Set Path
* To run from a specific folder, define the global BasePath before this point,
* e.g. by un-commenting and editing the next line:
*global BasePath "C:\FILE"
* Change directory only when BasePath is actually defined. With the global
* undefined the original unconditional call expanded to  chdir ""  which does
* not point at the data folder; in that case the script now simply keeps the
* current working directory.
if "$BasePath" != "" {
	chdir "$BasePath"
}
clear 
set more off

* Load Data Set (file is looked up relative to the working directory)
use TaxiEmpirics_Data.dta, clear

* Report the number of distinct values of the identifiers before any selection
* (distinct is a user-written command and must be installed)
distinct driver day hourofday t

* Flag day shifts: first_time within [5,14] and last_time within [8,17].
* dayshift equals 1 on flagged rows and is missing everywhere else.
bysort driver shift: gen dayshift = 1 if inrange(first_time, 5, 14) & inrange(last_time, 8, 17)

* Sample restriction: day shifts only, 1 <= k <= 11, and max_k <= 12
keep if dayshift == 1 & inrange(k, 1, 11) & max_k <= 12

* Store the restricted sample; it is reloaded later for the merge with the
* earnings time series
save TaxiEmpirics_Data2.dta, replace

* Market-level state variable w: within each hour t, the mean of hourlyearnings
* over rows with a==1 and max_k<=12 (rows failing the condition get w = missing)
bysort t: egen w = mean(hourlyearnings) if a==1 & max_k<=12
* Alternative definition using the median, kept for reference:
*by t: egen w = median(hourlyearnings) if a==1 & max_k<=12

* Reduce the data to one row per t holding w and hourofday
* (collapse takes means, which ignore the missing w values)
keep t w hourofday
collapse (mean) w hourofday, by(t)

* Declare t as the time index so that lag operators are available
tsset t

* Lagged earnings, used as instrumental variables
* z1: one period back (previous hour)
gen z1 = L1.w
* z2: 24 periods back (same hour, previous day)
gen z2 = L24.w
* z3: 168 periods back (same hour and weekday, previous week).
* Original author note: "OBS: DELETE"; z3 is not used by the regressions below.
gen z3 = L168.w

* Store the hour-level earnings series and empty the memory
save timeseries_wz.dta, replace
clear 

* Reload the restricted driver-level sample (the clear option empties memory first)
use TaxiEmpirics_Data2.dta, clear 

* Attach the hour-level earnings w and its lags z1-z3 to every row, matching on t
sort t
merge m:1 t using timeseries_wz.dta
drop _merge

* Declare the panel: driver is the unit, t is the time index
sort driver t
xtset driver t

* Put the main variables first
order driver t day month dayinmonth hourofday a k shift first last dayshift max_k k2 k3 k4 w z1 z2 z3 dow hourlyearnings

* Keep the days of the week coded 1 through 4 (Monday--Thursday)
keep if inrange(dow, 1, 4)

* Report the number of distinct identifier values in the selected sample
distinct driver day hourofday t

* Table 4: Summary Stats -- a and w summarized by hour of the day
tabulate hourofday, summarize(a)
tabulate hourofday, summarize(w)



***************************************************************************
* 2. FIRST STAGE -- ESTIMATE CCPS
***************************************************************************

* Generate CCPs
* For every hour of the day h = 8,...,16: fit a logit of a on a quartic in k plus
* calendar, holiday and weather controls on the rows with hourofday == h, then
* evaluate the fitted probability at the counterfactual states k = 4,...,h-4.
* Output: variables p<h>_<k>, one per (hour, state) pair.
forvalues tt = 8/16 {

	* Flexible logit for this hour; its coefficients stay available in _b[]
	display "Running Logit for Hour of the Day = `tt' "
	quietly logit a k k2 k3 k4 i.month i.dayinmonth unweek fourthofjuly memorialday newyearsday i.weather if hourofday==`tt' 
	* Linear index; predict carries no if-qualifier, so it is filled for all rows
	predict xb`tt', xb

	* Largest state evaluated at this hour
	local kmax = `tt' - 4
	forvalues kk = 4/`kmax' {

		display "Calculating CCP for k = `kk' "

		* Swap the row's own quartic in k for the quartic evaluated at k = `kk'
		* (assumes k2, k3, k4 hold k^2, k^3, k^4 -- TODO confirm against the data)
		gen xb`tt'_`kk' = xb`tt' - _b[k]*k - _b[k2]*k2 - _b[k3]*k3 - _b[k4]*k4 + _b[k]*`kk' + _b[k2]*`kk'^2 + _b[k3]*`kk'^3 + _b[k4]*`kk'^4

		* Logistic transform of the counterfactual index: Pr(a = 1 | hour, k)
		gen p`tt'_`kk' = exp(xb`tt'_`kk')/(1+exp(xb`tt'_`kk'))

	}

}
* The linear indices were only intermediate; keep the probabilities
drop xb*


***************************************************************************
* 3. SECOND STAGE -- ESTIMATE MODEL PARAMETERS
***************************************************************************

* Generate Dependent Variable, Y_mt(k)
* For hour h and state k = 4,...,h-5 the variable yk<k>_<h> is the log odds of
* p<h>_<k>; for h <= 15 it additionally adds 0.999999 times the log of
* p<h+1>_<k+1>. At h = 8 the state range 4/3 is empty, so nothing is created.
* NOTE(review): p is Pr(a = 1) from the first-stage logit, while the Table 5 note
* describes the added term as the probability of stopping at t+1 -- confirm
* whether ln(p) or ln(1-p) is intended for the forward term.
forvalues tt = 8/16 {

	local kmax = `tt' - 5
	forvalues kk = 4/`kmax' {

		* Next hour and next state
		local ft = `tt' + 1
		local fk = `kk' + 1

		* Log odds at (hour, state); the macro is expanded textually below
		local logodds "ln(p`tt'_`kk'/(1-p`tt'_`kk'))"

		if `tt' == 16 {
			* Last hour: no forward term
			gen yk`kk'_`tt' = `logodds'
		}
		else {
			* Hours up to 15: add the discounted forward term
			gen yk`kk'_`tt' = `logodds' + 0.999999 * ln(p`ft'_`fk') 
		}

	}
}

* Collapse Data to one row per day m and hour t (means of all listed variables).
* Note: y* and z* are wildcards and pick up every variable with that prefix.
collapse (mean) y* w z* month dayinmonth hourofday unweek fourthofjuly memorialday newyearsday weather, by(day t)
distinct day hourofday t 
xtset day t
order day t month dayinmonth hourofday w z* y*

* Reshape Data: stack the hour suffix (yk<k>_<hour>) into the new variable hour
local stubs
forvalues s = 4/11 {
	local stubs `stubs' yk`s'_
}
reshape long `stubs', i(day t) j(hour)

* Strip the trailing underscore left on the stubs
forvalues s = 4/11 {
	rename yk`s'_ yk`s'
}

* Reshape Data: stack the state suffix (yk<k>) into the new variable kk
reshape long yk, i(day t hour) j(kk)

* Final Adjustments
rename yk y
* dk: one group per (day, state); used as the cluster variable in the regressions
egen dk = group(day kk) 
* id: one group per (day, t, state); panel unit with hour as the time index
egen id = group(day t kk)
sort id hour
xtset id hour

* Run ECCP Second Stage Regressions
* All three specifications share the same controls and cluster on dk.
* eststo stores them, in order, as est1 (OLS), est2 (IV = z1), est3 (IV = z2).
* eststo and weakivtest are user-written commands and must be installed.
local controls kk unweek fourthofjuly memorialday newyearsday i.weather i.hourofday

eststo clear

* ECCP OLS
eststo: quietly regress y w `controls', cluster(dk)
* Ratio of the coefficient on kk to the coefficient on w
display _b[kk]/_b[w]

* ECCP 2SLS, once per instrument:
*   z1 = Within-Day One-hour Lagged Earnings
*   z2 = Previous Day Earnings
foreach iv in z1 z2 {
	eststo: quietly ivregress 2sls y (w = `iv') `controls', cluster(dk)
	display _b[kk]/_b[w]
	* Weak-instrument test for the specification just estimated
	weakivtest
}

* Table 5: Parameter Estimates
* Shows only w and kk from the three stored models (est1-est3); all controls
* and the constant are dropped from the display.
* The LaTeX math delimiters in the note are written as \$ because an unescaped
* $t or $m is expanded by Stata as a global macro before esttab sees the text,
* which would strip the symbols and leave unbalanced dollar signs in the note.
* NOTE(review): the note says standard errors are clustered by day, while the
* regressions cluster on dk = group(day kk) -- confirm which is intended.
	esttab est1 est2 est3, se noobs ///
	mtitles("OLS" "Within Day IV" "Previous Day IV" ) ///
	drop(unweek fourthofjuly memorialday newyearsday 1.weather 2.weather 3.weather 4.weather 5.weather 6.weather 7.weather 9.weather 10.weather ///
	6.hourofday 7.hourofday 8.hourofday 9.hourofday 10.hourofday 11.hourofday 12.hourofday 13.hourofday 14.hourofday 15.hourofday 16.hourofday 17.hourofday _cons) nonotes ///
	order(w kk) coeflabel(w "Market Hourly Earnings, w" kk "Hours Driving, k") ///
	addnote(\pbox{13cm}{\vspace{1mm} Notes: The unit of observation is an hour (\$t\$) of a day (\$m\$) in 2013. The dependent variable is the log odds ratio of the conditional choice probabilities of continue working vs stopping at \$t\$ plus the discounted choice probability of stop working at \$t+1\$. The first column presents the OLS estimates. The second column, the 2SLS estimates using the within-day lagged earnings as an instrument. And the third column, the 2SLS estimates using the earnings in the same hour of the day but in the previous day as instrument. Robust Standard errors in parentheses are clustered at the 'market' level \$m\$ (day).})

