/* 
This is Stata code for defining diabetes mellitus using administrative data.
The algorithm is published in Tonelli et al., BMC Med Inform Dec Making 2015;15:31,
and is based on the work in Hux et al., Diabetes Care 2002;25:62-69.

1)  Specify the file name for the hospitalization data (global data_hosp)
    Define the stem of the ICD-9 diagnosis variables as hosp_icd9_code; the variables are named <stem>X (X indicating the diagnosis code position)
    Define the maximum number of ICD-9 diagnosis code positions as num_hosp_icd9
    Define the stem of the ICD-10 diagnosis variables as hosp_icd10_code; the variables are named <stem>X (X indicating the diagnosis code position)
    Define the maximum number of ICD-10 diagnosis code positions as num_hosp_icd10
    Define the variable name of the hospitalization start date as hosp_start_date
    Define the stem of the diagnosis type variables as type; the variables are named <stem>X (one per diagnosis code position)
	
2)  Specify the file name for the physician claims data (global data_claim)
    Define the stem of the ICD-9 diagnosis variables as claim_icd9_code; the variables are named <stem>X (X indicating the diagnosis code position)
    Define the maximum number of ICD-9 diagnosis code positions as num_claim_icd9
    Define the variable name of the physician claim date as claim_start_date
   
3)  Specify the directory of the output dataset

4)  Create a study cohort which includes the unique patient identifier, the study 
	start and end dates (e.g., when a patient turns 18 until they die or out-migrate)
*/

* ---------------------------------------------------------------------------
* Configuration globals read by ICDmm30_dm.
* Code and type entries are variable-name stems: the diagnosis position
* number is appended to the stem when the variables are referenced.
* ---------------------------------------------------------------------------

* Input and output locations
global data_hosp        "G:\Open Data\ICDC 2013\ICDC Source\Constant files\hosp94_2013.dta"
global data_claim       "G:\Open Data\ICDC 2013\ICDC Source\claims_94_13.dta"
global out_source       "G:\Projects\Phoebe\MM30codes\kidtran"

* Hospitalization dataset: diagnosis stems, number of positions, date, type stem
global hosp_icd9_code   "hosp_icd9dx_code"
global num_hosp_icd9    16
global hosp_icd10_code  "hosp_icd10dx_code"
global num_hosp_icd10   25
global hosp_start_date  "start_date"
global type             "diag_type"

* Physician claims dataset: diagnosis stem, number of positions, date
global claim_icd9_code  "hlth_dx_icd9x_code_"
global num_claim_icd9   3
global claim_start_date "start_date"

/*
	Sample program call
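	ICDmm30_dm cohort akdnid study_start_date study_end_date 
		'cohort' is the file name of the study cohort
		'akdnid' is the name of the variable holding the unique patient identifier
		'study_start_date' is the name of the cohort variable holding the study start date
		'study_end_date' is the name of the cohort variable holding the study end date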
*/

version 13.1
capture program drop ICDmm30_dm

/*
	ICDmm30_dm cohort id study_start_date study_end_date

	Flags diabetes mellitus for the patients in a study cohort and saves one
	row per flagged patient (id, dm_date, dm=1) to ICD_dm.dta in the
	directory given by global out_source.

	A patient is flagged when, inside the study window, there is either
	  - one hospitalization with a diabetes code (ICD-9 250; ICD-10 E10,
	    E11, E13, E14) in any diagnosis position, or
	  - a physician claim with ICD-9 250 that is followed by another such
	    claim within round(365.25*2) days.
	dm_date is the earliest qualifying hospitalization date or the date of
	the earliest claim that starts a qualifying pair, whichever comes first.

	Arguments (positional)
	  cohort            cohort file name (may be quoted and contain spaces)
	  id                name of the patient identifier variable
	  study_start_date  expression, normally a cohort variable, giving the
	                    start of the study window
	  study_end_date    expression, normally a cohort variable, giving the
	                    end of the study window

	Reads the globals data_hosp, hosp_icd9_code, num_hosp_icd9,
	hosp_icd10_code, num_hosp_icd10, hosp_start_date, type, data_claim,
	claim_icd9_code, num_claim_icd9, claim_start_date and out_source.

	The data in memory are replaced. Exits with error 198 when fewer than
	four arguments are supplied.
*/
program ICDmm30_dm
	version 13.1
	set more off

	gettoken cohort 0:0
	gettoken id 0:0
	gettoken study_start_date 0:0
	gettoken study_end_date 0:0
	if "`study_end_date'" == "" {
		display as error "syntax: ICDmm30_dm cohort id study_start_date study_end_date"
		exit 198
	}

	* Intermediate datasets live in temporary files so that nothing is left
	* behind, and no user file is overwritten, if a later step fails
	tempfile cohort_dm hosp_dm

	* Deduplicated cohort used as the lookup side of both merges
	use "`cohort'", clear
	duplicates drop
	sort `id', stable
	save "`cohort_dm'"

	* Diabetes code prefixes, compared after dots are removed
	local ICD9_dm 250
	local ICD10_dm E10 E11 E13 E14

	local n9 = ${num_hosp_icd9}
	local n10 = ${num_hosp_icd10}
	local ntype = max(`n9', `n10')
	local nclaim = ${num_claim_icd9}

	* ------------------------------------------------------------------
	* One hospitalization
	* ------------------------------------------------------------------
	use "${data_hosp}", clear

	* Explicit variable lists, so the selection depends neither on the
	* physical order of variables in the file nor on a fixed count
	local hospvars
	forvalues k = 1/`n9' {
		local hospvars `hospvars' ${hosp_icd9_code}`k'
	}
	forvalues k = 1/`n10' {
		local hospvars `hospvars' ${hosp_icd10_code}`k'
	}
	forvalues k = 1/`ntype' {
		local hospvars `hospvars' ${type}`k'
	}
	keep `id' ${hosp_start_date} `hospvars'

	recast long ${hosp_start_date}, force
	sort `id', stable
	merge m:1 `id' using "`cohort_dm'"
	keep if _merge==3
	drop _merge
	keep if ${hosp_start_date}>=`study_start_date' & ${hosp_start_date}<=`study_end_date'

	* Strip dots from the codes. capture keeps this best effort, as in the
	* original, for example when a code variable is not a string
	forvalues k = 1/`n9' {
		capture replace ${hosp_icd9_code}`k' = subinstr(${hosp_icd9_code}`k', ".", "", .)
	}
	forvalues k = 1/`n10' {
		capture replace ${hosp_icd10_code}`k' = subinstr(${hosp_icd10_code}`k', ".", "", .)
	}

	* Positions are scanned from last to first so that the lowest numbered
	* matching position is the one left in code and type
	gen type=""
	gen code=""
	foreach i in `ICD9_dm' {
		local len = strlen("`i'")
		forvalues dx = `n9'(-1)1 {
			capture replace code = "`i'" if substr(${hosp_icd9_code}`dx',1,`len') == "`i'"
			capture replace type = ${type}`dx' if substr(${hosp_icd9_code}`dx',1,`len') == "`i'"
		}
	}
	foreach j in `ICD10_dm' {
		local len = strlen("`j'")
		forvalues dx = `n10'(-1)1 {
			capture replace code = "`j'" if substr(${hosp_icd10_code}`dx',1,`len') == "`j'"
			capture replace type = ${type}`dx' if substr(${hosp_icd10_code}`dx',1,`len') == "`j'"
		}
	}

	keep if code!=""
	gen source="hosp"
	keep `id' ${hosp_start_date} type code source
	rename ${hosp_start_date} dm_date
	* Earliest qualifying hospitalization per patient
	bysort `id' (dm_date): keep if _n==1
	save "`hosp_dm'"

	* ------------------------------------------------------------------
	* Two claims within 2 years
	* ------------------------------------------------------------------
	use "${data_claim}", clear
	local claimvars
	forvalues k = 1/`nclaim' {
		local claimvars `claimvars' ${claim_icd9_code}`k'
	}
	keep `id' ${claim_start_date} `claimvars'

	recast long ${claim_start_date}, force
	sort `id', stable
	merge m:1 `id' using "`cohort_dm'"
	keep if _merge==3
	drop _merge
	keep if ${claim_start_date}>=`study_start_date' & ${claim_start_date}<=`study_end_date'

	forvalues k = 1/`nclaim' {
		capture replace ${claim_icd9_code}`k' = subinstr(${claim_icd9_code}`k', ".", "", .)
	}

	gen code=""
	foreach i in `ICD9_dm' {
		local len = strlen("`i'")
		forvalues dx = `nclaim'(-1)1 {
			capture replace code = "`i'" if substr(${claim_icd9_code}`dx',1,`len') == "`i'"
		}
	}

	keep if code!=""
	* A claim qualifies when the next diabetes claim of the same patient
	* falls within the window. The last claim of a patient never qualifies
	* because the lead value is missing and the comparison is then false
	local window = round(365.25*2,1)
	bysort `id' (${claim_start_date}): gen claim=1 if ${claim_start_date}[_n+1]<=(${claim_start_date}[_n]+`window')
	keep if claim==1
	bysort `id' (${claim_start_date}): keep if _n==1
	gen source="claim"
	keep `id' ${claim_start_date} code source
	rename ${claim_start_date} dm_date

	* ------------------------------------------------------------------
	* Combine both sources and keep the earliest date per patient
	* ------------------------------------------------------------------
	append using "`hosp_dm'"
	bysort `id' (dm_date): keep if _n==1
	gen dm=1
	keep `id' dm_date dm
	sort `id' dm_date
	* Forward slash is accepted by Stata on every platform
	save "${out_source}/ICD_dm.dta", replace
end