*****************
*** LECTURE 2 ***
*****************


* 1.	Preliminary steps
* Move to the folder that holds the lecture data files.
cd "C:\Desktop\lezione_2"
* Close any log left open by a previous (possibly interrupted) run of this
* do-file; "capture" swallows the error Stata raises when no log is open.
* Without this, re-running after a failure stops at "log using" because
* a log file is already open.
capture log close
log using "lecture2.log", replace


* 2.	HYPOTHESIS TESTING AND UNIVARIATE REGRESSION

* Load the exercise data from the Excel workbook (first row = variable names)
import excel using "dati_esercizio.xls", sheet("Sheet1") firstrow clear

describe
label variable edu_father "=1 se padre con diploma o laurea"
label variable donna "=1 se donna"

* Question 1: do women earn less than men?
* Two-sample t test of monthly salary across the two values of donna.
ttest salario_mensile, by(donna)

* Question 2: do the children of more educated fathers earn more?
ttest salario_mensile, by(edu_father)

/*
Question 3: do women with highly educated fathers earn more than men with
poorly educated fathers?
  Step 1: flag the two groups of interest with an indicator (selezione):
          =1 for women with educated fathers and for men with poorly
          educated fathers, =0 for everybody else.
  Step 2: run "ttest" restricted to the flagged observations ("if") and
          split by gender ("by").
*/

* The logical expression evaluates to 1 when either condition holds, 0 otherwise
generate selezione = (donna==1 & edu_father==1) | (donna==0 & edu_father==0)

ttest salario_mensile if selezione==1, by(donna)


/*
Univariate regression: monthly salary (dependent variable) on age
(independent variable).
Does getting one year older affect wages? How large is the effect?
*/

regress salario_mensile age


* 3.	EXPLORATORY ANALYSIS OF THE RAPPORTI DI LAVORO DATABASE AND APPEND

* Look at the 2000 file: variables and distribution of the year variable
use rapporti_lavoro_2000.dta, clear
describe
tabulate anno

* Same inspection for the 2001 file, which stays in memory as the master data
use rapporti_lavoro_2001.dta, clear
describe
tabulate anno

* Stack the 2000 and then the 1999 observations below the data in memory
append using rapporti_lavoro_2000.dta rapporti_lavoro_1999.dta

* Check the stacked data and store it on disk
describe
tabulate anno
save panel_rl.dta, replace

* Continue without the 1999 observations (the saved file still contains them)
keep if anno != 1999
tabulate anno

* Summary statistics of tempo_d for each year, then its distribution year by year
tabulate anno, summarize(tempo_d)
tabulate tempo_d if anno==2000
tabulate tempo_d if anno==2001

* Two-sample test of proportions: compares tempo_d between the two remaining years
prtest tempo_d, by(anno)

* 4.	ADD VARIABLES WITH MERGE COMMAND AND REGRESSION ANALYSIS

* Inspect the file that will be merged in
use anagrafica_soggetti.dta, clear
describe

* Load the master data (2001 file)
use rapporti_lavoro_2001.dta, clear
describe

* One-to-one merge on the subject identifier
merge 1:1 id_soggetto using anagrafica_soggetti.dta
describe
* _merge reports where each observation comes from; 3 = found in both files
tabulate _merge
keep if _merge==3
drop _merge

* Age in years = observation year minus year of birth
generate age = anno - anno_nascita

summarize retrib03 age

* Linear regression of retrib03 on age
regress retrib03 age

* Fitted values (linear prediction) of the model above
predict y_hat, xb

/*
This is a trick to make the graph file size smaller:
scatter prints a point for each nonmissing observation;
since we have 600'000 observations, this is very heavy and slow.
y_hat has the same value for each age level, thus
we need only one point per age value.
SOLUTION:
- create a variable (we call it "step") =1 for exactly one observation of
  each age value, 0 otherwise
- do the scatterplot only for observations where step==1
*/
* egen's tag() marks one observation per distinct value of age with 1 and all
* the others with 0 (observations with missing age get 0).
egen step = tag(age)

scatter y_hat age if step==1

* Quadratic term, to let the age profile be non-linear
gen age2=age^2

reg retrib03 age age2

* Fitted values of the quadratic model
predict y_hat2

* The variable step is useful also for this graph.
* Plot 1 is y_hat2 (quadratic model) and plot 2 is y_hat (linear model):
* the legend keys must follow that numbering, otherwise the labels are swapped.
twoway (scatter y_hat2 age if step==1) (scatter y_hat age if step==1), ///
legend(order(2 "Linear model" 1 "Quadratic model"))

* Marginal effect of moving from 39 to 40 years old:
* the difference between the two group means of the predictions is the
* predicted wage change (first the linear model, then the quadratic one)
ttest y_hat if inlist(age, 39, 40), by(age)

ttest y_hat2 if inlist(age, 39, 40), by(age)


* Marginal effect of moving from 29 to 30 years old (same comparison)
ttest y_hat if inlist(age, 29, 30), by(age)

ttest y_hat2 if inlist(age, 29, 30), by(age)


* Average the actual wages and the two predictions within each age.
* collapse replaces the data in memory with a new dataset holding one
* observation per age; retrib03, y_hat and y_hat2 now contain the by-age means.
collapse (mean) retrib03 y_hat y_hat2, by(age)

* Basic version: the three age profiles as connected lines with very small markers
twoway connected retrib03 age, msize(vsmall) ///
    || connected y_hat age, msize(vsmall) ///
    || connected y_hat2 age, msize(vsmall)


* Polished version: same plots plus legend labels, title and axis titles
* (twoway has a lot of options to play with!)
twoway connected retrib03 age, msize(vsmall) ///
    || connected y_hat age, msize(vsmall) ///
    || connected y_hat2 age, msize(vsmall) ///
    ||, legend(order(1 "Actual wages" 2 "Linear prediction" 3 "Quadratic prediction")) ///
    title("Actual average and predicted wage by age") ///
    ytitle("Wage level") xtitle("Age")


* 5.	Close the log file
* Stops recording output to the log opened with "log using" in step 1.
log close