/* 
This is Stata code for identifying stroke and TIA using administrative data
The algorithm is published in Tonelli et al BMC Med Inform Dec Making 2015;15:31
and based on the work in Kokotailo et al Stroke 2005;36:1776-1781

1)  Specify the file name for the hospitalization data
    Define the variable names for the ICD-9 codes as hosp_icd9_codeX (X indicating the diagnosis code position)
	Define the maximum number of ICD-9 diagnosis code position as num_hosp_icd9
	Define the variable name for ICD-10 codes as hosp_icd10_codeX (X indicating the diagnosis code position)
	Define the maximum number of ICD-10 diagnosis code position as num_hosp_icd10
    Define the variable name of the hospitalization start date as hosp_start_date
	Define the stub of the diagnosis-type variables as type (typeX holds the type of the diagnosis code in position X)
	Define the values that flag the most responsible diagnosis (hosp_most_responsible) and the post-admittance diagnosis (hosp_post_admittance)
	
2)  Specify the file name for the physician claims data 
    Define the variable names for the ICD-9 codes as claim_icd9_codeX (X indicating the diagnosis code position)
	Define the maximum number of ICD-9 diagnosis code position as num_claim_icd9
    Define the variable name of the physician claims date as claim_start_date
   
3)  Specify the file name for the ACCS data 
    Define the variable names for the ICD-9 codes as accs_icd9_codeX (X indicating the diagnosis code position)
	Define the maximum number of ICD-9 diagnosis code position as num_accs_icd9
	Define the variable names for ICD-10 codes as accs_icd10_codeX (X indicating the diagnosis code position)
	Define the maximum number of ICD-10 diagnosis code position as num_accs_icd10
    Define the variable name of the ACCS start date as accs_start_date
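	Define the variable name of the MIS code as accs_mis_code and the list of
	emergency MIS codes to keep as accs_ed_mis_code
	Define a value for the most responsible diagnosis code (accs_most_responsible).
	Only diagnosis position 1 is examined; that position is hardcoded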
	
4)  Specify the directory of the output dataset

5)  Create study cohort which includes the unique patient identifier, the study 
	start and end dates (e.g., when a patient turns 18 until they die or out-migrate)
*/

* File name and variable names for the hospitalizations dataset
global data_hosp = "G:\Open Data\ICDC 2013\ICDC Source\Constant files\hosp94_2013.dta"
global hosp_icd9_code = "hosp_icd9dx_code"
global num_hosp_icd9 = 16
global hosp_icd10_code = "hosp_icd10dx_code"
global num_hosp_icd10 = 25
global hosp_start_date = "start_date"
global type = "diag_type"
global hosp_most_responsible = "M"
global hosp_post_admittance = "2"
	
* File name and variable names for the physician claims dataset
global data_claim = "G:\Open Data\ICDC 2013\ICDC Source\claims_94_13.dta"
global claim_icd9_code = "hlth_dx_icd9x_code_"
global num_claim_icd9 = 3
global claim_start_date = "start_date"

* File name and variable names for the ACCS dataset
global data_accs = "G:\Open Data\ICDC 2013\ICDC Source\Constant files\accs97_2013.dta" 
global accs_icd9_code = "accs_icd9dx_code"
global num_accs_icd9 = 6
global accs_icd10_code = "accs_icd10dx_code"
global num_accs_icd10 = 10
global accs_start_date = "start_date"
global accs_mis_code = "mis_code" 
global accs_ed_mis_code = "713100000 713102000 71310 7131020"
global accs_most_responsible = "M"


* Output dataset directory
global out_source = "G:\Projects\Phoebe\MM30codes\kidtran" 

/*
	Sample program call
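	ICDmm30_stroke cohort akdnid study_start_date study_end_date 
		'cohort' is the file name of the study cohort
		'akdnid' is the name of the variable holding the unique patient identifier
		'study_start_date' is the name of the cohort variable holding the study start date
		'study_end_date' is the name of the cohort variable holding the study end date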
*/

version 13.1
capture program drop ICDmm30_stroke
program ICDmm30_stroke
set more off
gettoken cohort 0:0
gettoken id 0:0
gettoken study_start_date 0:0
gettoken study_end_date 0:0


use `cohort', clear
duplicates drop
sort `id', stable
save cohort_stroke, replace


* Stroke or TIA
local ICD9_stroke 3623 430 431 43301 43311 43321 43331 43381 43391 43401 43411 43491 435 436
local ICD10_stroke G450 G451 G452 G453 G458 G459 H341 I60 I61 I63 I64


* One hospitalization
use "${data_hosp}" , clear
	keep `id' ${hosp_start_date} ${hosp_icd9_code}1-${hosp_icd9_code}${num_hosp_icd9} ${hosp_icd10_code}1-${hosp_icd10_code}${num_hosp_icd10} ///
	${type}1- ${type}16  ${type}17- ${type}${num_hosp_icd10}
	recast long ${hosp_start_date}, force
	sort `id', stable
	merge n:1 `id' using cohort_stroke
	keep if _merge==3
	drop _merge
	keep if ${hosp_start_date}>=`study_start_date' & ${hosp_start_date}<=`study_end_date'
	
	local n=${num_hosp_icd9}
	forvalues nhos9=1(1)`n' {
	 capture replace ${hosp_icd9_code}`nhos9'= subinstr(${hosp_icd9_code}`nhos9', ".", "",.) 
	   }
	local m=${num_hosp_icd10}	
	forvalues nhos10=1(1)`m' {
	   capture replace ${hosp_icd10_code}`nhos10'= subinstr(${hosp_icd10_code}`nhos10', ".", "",.) 
	   }
	   
	gen type=""
	gen code=""	
	local n=${num_hosp_icd9}
	local m=${num_hosp_icd10}
	foreach i in `ICD9_stroke'   {
		local len=strlen("`i'")
		forvalues dx=`n'(-1)1 {
			 capture replace code="`i'" if substr(${hosp_icd9_code}`dx',1,`len') =="`i'" 
			 capture  replace type=${type}`dx' if substr(${hosp_icd9_code}`dx',1,`len') =="`i'" 
		}
	}		
	
	foreach j in `ICD10_stroke' {
		local len=strlen("`j'")
		forvalues dx=`m'(-1)1 {			
		   capture replace code = "`j'" if substr(${hosp_icd10_code}`dx',1,`len') == "`j'"
		   capture  replace type=${type}`dx' if substr(${hosp_icd10_code}`dx',1,`len')== "`j'"
		}
	}
	
	
	keep if code!=""
	keep if  type=="${hosp_most_responsible}" | type=="${hosp_post_admittance}"
	keep `id' ${hosp_start_date} type code
	gen source="hosp"
	keep `id' ${hosp_start_date} type code source
	rename ${hosp_start_date} stroke_date
	sort `id' stroke_date, stable
    bysort `id': keep if _n==1	
save "${out_source}\ICD_stroke.dta", replace


* One claim
use "${data_claim}",clear
	keep `id' ${claim_start_date} ${claim_icd9_code}1-${claim_icd9_code}${num_claim_icd9}
	recast long ${claim_start_date}, force
	sort `id', stable
	merge n:1 `id' using cohort_stroke
	keep if _merge==3
	drop _merge
	keep if ${claim_start_date}>=`study_start_date' & ${claim_start_date}<=`study_end_date'

	local n=${num_claim_icd9}
	forvalues nclaim9=1(1)`n' {
	   capture replace ${claim_icd9_code}`nclaim9'= subinstr(${claim_icd9_code}`nclaim9', ".", "",.) 
	   }

	gen code="" 
	local n=${num_claim_icd9}
	foreach i in `ICD9_stroke' {
		local len=strlen("`i'")
		forvalues dx=`n'(-1)1 {
			capture replace code="`i'" if substr(${claim_icd9_code}`dx',1,`len') =="`i'" 
		}
	}

	keep if code!=""
	sort `id' ${claim_start_date}, stable

	gen source="claim"
	keep `id' ${claim_start_date} code source
	rename ${claim_start_date} stroke_date
    sort `id' stroke_date
	bysort `id': keep if _n==1


	* Append hospitalizations with claims
	append using "${out_source}\ICD_stroke.dta"
	sort `id' stroke_date, stable
save "${out_source}\ICD_stroke.dta", replace
	
* One most responsible ED accs 
use "${data_accs}",clear
	keep `id' ${accs_start_date} ${accs_icd9_code}1 ${accs_icd10_code}1 ${accs_mis_code}
	*keep if ${accs_mis_code}=="713100000" | ${accs_mis_code}=="713102000" | ${accs_mis_code}=="71310" | ${accs_mis_code}=="7131020"
	gen temp=0
	foreach i in $accs_ed_mis_code {
		replace temp=1 if ${accs_mis_code}=="`i'"
	}
	keep if temp==1
	drop temp
	recast long ${accs_start_date}, force
	sort `id', stable
	merge n:1 `id' using cohort_stroke
	keep if _merge==3
	drop _merge
	keep if ${accs_start_date}>=`study_start_date' & ${accs_start_date}<=`study_end_date'

	capture replace ${accs_icd9_code}1= subinstr(${accs_icd9_code}1, ".", "",.) 
	capture replace ${accs_icd10_code}1= subinstr(${accs_icd10_code}1, ".", "",.) 
	  

	gen type=""
	gen code=""	
	foreach i in `ICD9_stroke' {
		local len=strlen("`i'")
			 capture replace code="`i'" if substr(${accs_icd9_code}1,1,`len') =="`i'"
			 capture replace type="${accs_most_responsible}" if substr(${accs_icd9_code}1,1,`len') =="`i'" 
		}

		foreach i in `ICD10_stroke' {
		local len=strlen("`i'")
			capture replace code="`i'" if substr(${accs_icd10_code}1,1,`len') =="`i'" 
			capture replace type="${accs_most_responsible}" if substr(${accs_icd10_code}1,1,`len') =="`i'" 
		}
	
	
	keep if code!=""
	sort `id' ${accs_start_date}, stable
	gen source="accs"
	keep `id' ${accs_start_date} type code source
	bysort `id': keep if _n==1
	rename ${accs_start_date} stroke_date
	
	* Append hospitalizations and claims with accs
	append using "${out_source}\ICD_stroke.dta"
	sort `id' stroke, stable
	bysort `id': keep if _n==1
	keep `id' stroke_date stroke
	sort `id' stroke_date 
save "${out_source}\ICD_stroke.dta", replace

erase cohort_stroke.dta
end