/* 
This is stata code for defining Epilepsy using administrative data
The algorithm is published in Tonelli et al BMC Med Inform Dec Making 2015;15:31
and based on the work in Jette et al Epilepsia 2010;51:62-69

1)  Specify the file name for the hospitalization data
    Define the variable names for the ICD-9 codes as hosp_icd9_code
	Define the maximum number of ICD-9 diagnosis code position as num_hosp_icd9
	Define the variable name for ICD-10 codes as hosp_icd10_code
	Define the maximum number of ICD-10 diagnosis code position as num_hosp_icd10
    Define the variable name of the hospitalization start date as hosp_start_date
	Define the variable names of the types of hospitalization as type
	Define the value for the most responsible diagnosis code
	
2)  Specify the file name for the physician claims data 
    Define the variable names for the ICD-9 codes as claim_icd9_code
	Define the maximum number of ICD-9 diagnosis code positions as num_claim_icd9
    Define the variable name of the physician claims date as claim_start_date
   
3)  Specify the file name for the ACCS data 
    Define the variable names for the ICD-9 codes as accs_icd9_code
	Define the maximum number of ICD-9 diagnosis code position as num_accs_icd9
	Define the variable names for ICD-10 codes as accs_icd10_code
	Define the maximum number of ICD-10 diagnosis code position as num_accs_icd10
    Define the variable name of the ACCS start date as accs_start_date
	Define the variable name of MIS functional center code (note: no global is
	defined for it below, and this epilepsy program does not use it)
	Define a value for the most responsible diagnosis code. The position for
	this code is hardcoded as 1
	
4)  Specify the directory of the output dataset

5)  Create study cohort which includes the unique patient identifier, the study 
	start and end dates (e.g., when a patient turns 18 until they die or out-migrate) 
*/

* ---------------------------------------------------------------------------
* Configuration globals read by the program defined below.
* Diagnosis-code and diagnosis-type entries are variable-name STUBS; the
* program appends the position number (1, 2, ...) to each stub.
* ---------------------------------------------------------------------------

* --- Hospitalizations: file, variable stubs, number of code positions ------
global data_hosp "G:\Open Data\ICDC 2013\ICDC Source\Constant files\hosp94_2013.dta"
global hosp_icd9_code "hosp_icd9dx_code"
global num_hosp_icd9 16
global hosp_icd10_code "hosp_icd10dx_code"
global num_hosp_icd10 25
global hosp_start_date "start_date"
global type "diag_type"
* Value of the diagnosis-type variable that marks the most responsible diagnosis
global hosp_most_responsible "M"

* --- Physician claims: file, variable stub, number of code positions -------
global data_claim "G:\Open Data\ICDC 2013\ICDC Source\claims_94_13.dta"
global claim_icd9_code "hlth_dx_icd9x_code_"
global num_claim_icd9 3
global claim_start_date "start_date"

* --- ACCS: file and variable stubs (only position 1 is read by the program) -
global data_accs "G:\Open Data\ICDC 2013\ICDC Source\Constant files\accs97_2013.dta"
global accs_icd9_code "accs_icd9dx_code"
global accs_icd10_code "accs_icd10dx_code"
global accs_start_date "start_date"
global accs_most_responsible "M"

* --- Directory that receives ICD_epilepsy.dta -------------------------------
global out_source "G:\Projects\Phoebe\MM30codes\kidtran"

/*
	Sample program call
	ICDmm30_epilepsy cohort akdnid study_start_date study_end_date 
		'cohort' is the file name of the study cohort
		'akdnid' is the name of the unique patient identifier variable
		'study_start_date' is the name of the study start date variable in the cohort file
		'study_end_date' is the name of the study end date variable in the cohort file
*/

version 13.1
capture program drop ICDmm30_epilepsy
program ICDmm30_epilepsy
	* -----------------------------------------------------------------------
	* Flags epilepsy (ICD-9 345; ICD-10 G40, G41) for every cohort member from
	* three sources, within each member's study window:
	*   - hospitalizations: one record with a matching code whose diagnosis
	*     type equals ${hosp_most_responsible}
	*   - physician claims: two matching claims at most round(365.25*2) days
	*     apart (the earlier claim's date is used)
	*   - ACCS: one record with a matching code in diagnosis position 1
	*
	* Arguments (positional):
	*   cohort            file name of the study cohort dataset
	*   id                name of the unique patient identifier variable
	*   study_start_date  name of the study start date variable in the cohort
	*   study_end_date    name of the study end date variable in the cohort
	*
	* Reads the globals data_hosp, data_claim, data_accs, the *_code stubs,
	* the num_* counts, the *_start_date names, type, hosp_most_responsible
	* and out_source.
	*
	* Output: "${out_source}\ICD_epilepsy.dta" with one row per qualifying
	* patient: `id', epilepsy_date (earliest qualifying date across the three
	* sources) and epilepsy (=1). The same file is also used for intermediate
	* results while the program runs.
	* -----------------------------------------------------------------------
	set more off
	args cohort id study_start_date study_end_date

	* All four arguments are required; stop early with a usage message.
	if "`study_end_date'" == "" {
		display as error "syntax: ICDmm30_epilepsy cohort id study_start_date study_end_date"
		exit 198
	}

	* De-duplicated cohort, held in a temporary file so that nothing is
	* written to (or erased from) the working directory. Stata removes the
	* temporary file automatically when the program ends.
	tempfile cohort_dedup
	use "`cohort'", clear
	duplicates drop
	sort `id', stable
	save "`cohort_dedup'"

	* Epilepsy code prefixes (compared after dots are stripped from the data)
	local ICD9_epilepsy 345
	local ICD10_epilepsy G40 G41

	* ------------------------- One hospitalization -------------------------
	use "${data_hosp}", clear

	local n_icd9  = ${num_hosp_icd9}
	local n_icd10 = ${num_hosp_icd10}
	* A diagnosis-type variable is needed for every position that is scanned.
	local n_type  = max(`n_icd9', `n_icd10')

	* Build the keep-list by name from the configured counts, so it follows
	* the num_hosp_* globals and does not depend on the variables' order in
	* the dataset.
	local hosp_keep
	forvalues k = 1/`n_icd9' {
		local hosp_keep `hosp_keep' ${hosp_icd9_code}`k'
	}
	forvalues k = 1/`n_icd10' {
		local hosp_keep `hosp_keep' ${hosp_icd10_code}`k'
	}
	forvalues k = 1/`n_type' {
		local hosp_keep `hosp_keep' ${type}`k'
	}
	keep `id' ${hosp_start_date} `hosp_keep'

	recast long ${hosp_start_date}, force
	sort `id', stable
	merge n:1 `id' using "`cohort_dedup'"
	keep if _merge==3
	drop _merge
	keep if ${hosp_start_date}>=`study_start_date' & ${hosp_start_date}<=`study_end_date'

	* Strip dots so that e.g. "345.1" is compared as "3451".
	* capture keeps the original best-effort behaviour: a variable on which
	* the string function cannot be applied is skipped silently.
	forvalues k = 1/`n_icd9' {
		capture replace ${hosp_icd9_code}`k' = subinstr(${hosp_icd9_code}`k', ".", "", .)
	}
	forvalues k = 1/`n_icd10' {
		capture replace ${hosp_icd10_code}`k' = subinstr(${hosp_icd10_code}`k', ".", "", .)
	}

	* Record the matching code and its diagnosis type. Positions are scanned
	* from last to first so that lower positions win. Once a match whose type
	* is the most responsible value has been recorded it is never overwritten;
	* otherwise a later match on another code (e.g. a secondary G41) would
	* replace the type of a most-responsible G40 and the record would be lost.
	gen type=""
	gen code=""
	foreach i of local ICD9_epilepsy {
		local len = strlen("`i'")
		forvalues dx = `n_icd9'(-1)1 {
			capture replace code = "`i'" ///
				if substr(${hosp_icd9_code}`dx',1,`len')=="`i'" & type!="${hosp_most_responsible}"
			capture replace type = ${type}`dx' ///
				if substr(${hosp_icd9_code}`dx',1,`len')=="`i'" & type!="${hosp_most_responsible}"
		}
	}
	foreach j of local ICD10_epilepsy {
		local len = strlen("`j'")
		forvalues dx = `n_icd10'(-1)1 {
			capture replace code = "`j'" ///
				if substr(${hosp_icd10_code}`dx',1,`len')=="`j'" & type!="${hosp_most_responsible}"
			capture replace type = ${type}`dx' ///
				if substr(${hosp_icd10_code}`dx',1,`len')=="`j'" & type!="${hosp_most_responsible}"
		}
	}

	keep if code!=""
	keep if type=="${hosp_most_responsible}"
	gen source="hosp"
	keep `id' ${hosp_start_date} code source
	rename ${hosp_start_date} epilepsy_date
	* Earliest qualifying hospitalization per patient; the date is named in
	* parentheses so the within-patient order is guaranteed.
	bysort `id' (epilepsy_date): keep if _n==1
	save "${out_source}\ICD_epilepsy.dta", replace

	* --------------------- Two claims within 2 years -----------------------
	use "${data_claim}", clear
	keep `id' ${claim_start_date} ${claim_icd9_code}1-${claim_icd9_code}${num_claim_icd9}
	recast long ${claim_start_date}, force
	sort `id', stable
	merge n:1 `id' using "`cohort_dedup'"
	keep if _merge==3
	drop _merge
	keep if ${claim_start_date}>=`study_start_date' & ${claim_start_date}<=`study_end_date'

	local n_claim = ${num_claim_icd9}
	forvalues k = 1/`n_claim' {
		capture replace ${claim_icd9_code}`k' = subinstr(${claim_icd9_code}`k', ".", "", .)
	}

	gen code=""
	foreach i of local ICD9_epilepsy {
		local len = strlen("`i'")
		forvalues dx = `n_claim'(-1)1 {
			capture replace code = "`i'" if substr(${claim_icd9_code}`dx',1,`len')=="`i'"
		}
	}

	keep if code!=""
	gen source="claim"
	keep `id' ${claim_start_date} code source
	rename ${claim_start_date} epilepsy_date

	* A claim qualifies when the patient's next claim (in date order) falls
	* within round(365.25*2) days of it. For a patient's last claim the next
	* date is missing, which compares as larger than any number, so it never
	* qualifies on its own.
	bysort `id' (epilepsy_date): gen claim=1 ///
		if epilepsy_date[_n+1]<=(epilepsy_date[_n]+round(365.25*2,1))
	keep if claim==1
	drop claim
	bysort `id' (epilepsy_date): keep if _n==1

	* Append hospitalizations to claims
	append using "${out_source}\ICD_epilepsy.dta"
	sort `id' epilepsy_date, stable
	save "${out_source}\ICD_epilepsy.dta", replace

	* ------------------------------ One ACCS -------------------------------
	* Only diagnosis position 1 is read for ACCS records.
	use "${data_accs}", clear
	keep `id' ${accs_start_date} ${accs_icd9_code}1 ${accs_icd10_code}1
	recast long ${accs_start_date}, force
	sort `id', stable
	merge n:1 `id' using "`cohort_dedup'"
	keep if _merge==3
	drop _merge
	keep if ${accs_start_date}>=`study_start_date' & ${accs_start_date}<=`study_end_date'

	capture replace ${accs_icd9_code}1 = subinstr(${accs_icd9_code}1, ".", "", .)
	capture replace ${accs_icd10_code}1 = subinstr(${accs_icd10_code}1, ".", "", .)

	gen code=""
	foreach i of local ICD9_epilepsy {
		local len = strlen("`i'")
		capture replace code = "`i'" if substr(${accs_icd9_code}1,1,`len')=="`i'"
	}
	foreach i of local ICD10_epilepsy {
		local len = strlen("`i'")
		capture replace code = "`i'" if substr(${accs_icd10_code}1,1,`len')=="`i'"
	}

	keep if code!=""
	gen source="accs"
	keep `id' ${accs_start_date} code source
	rename ${accs_start_date} epilepsy_date
	bysort `id' (epilepsy_date): keep if _n==1

	* Append hospitalizations and claims to ACCS, then keep each patient's
	* earliest qualifying date across all three sources.
	append using "${out_source}\ICD_epilepsy.dta"
	bysort `id' (epilepsy_date): keep if _n==1
	gen epilepsy=1
	keep `id' epilepsy_date epilepsy
	sort `id' epilepsy_date
	save "${out_source}\ICD_epilepsy.dta", replace
end