/************************************************************************************************************
This SAS program is designed to assist the NAACCR community in creating local SEER*Stat databases

This program contains 2 sections:

   A - the first reads either NAACCR, NPCR, or SEER Nov22 XML cancer incidence submission files. 
       This section harnesses SAS code and macros written by Fabian Depry, IMS, adds SAS labels, and removes fields from the SAS
       datasets that are 100% missing (blank). This program works with naaccr-xml-utility-9.0+ (https://github.com/imsweb/naaccr-xml/releases)
       If SEER registries want to include IHS linkage results for use in race categorization, they will need to insert the linkage results
       into their submission file first, then run this program.

   B - the second section writes the cancer incidence .CSV files for use in SEER*Prep.

Section A of this program harnesses SAS code for reading NAACCR XML files. For more information, see: 
https://education.naaccr.org/products/sassafras-reading-and-writing-naaccr-v21-xml-using-sas
https://www.naaccr.org/analysis-and-data-improvement-tools/#TRANSLATION

You will need to make changes to each row of the SAS code marked at the end with /* NEED TO CHANGE THIS */

************************************************************************************************************/


*** SECTION A - READ INCIDENCE DATA ***;


*** STEP 1. Use Fabian's tools to Read an XML file into SAS.
*** Specify the locaton of the read_naaccr_xml_macro.sas, the path for the SAS jar file, the source XML data file          ***; 
*** (this can read GZIP or unzipped XML files), and the CSV version of the user-defined data dictionary (if there is one). ***;

/************************************************************************************************************
    Original Comments from Fabian's "read" example:

    This programs demonstrates how to include and call the "read_naaccr_xml_macro.sas" macro.
    This example assumes that the JAR file [Java ARchive] is in the same folder as this program.
    Make sure that the naaccrVersion and recordType are correct or some data items won't be correctly populated.

    Commented out in the code below, this example includes two items, meaning that only those two items will be included in the resulting
    date set. That parameter is optional and if not provided, the data set will contain all standard
    items plus any non-standard items provided via the extra dictionary. Be aware that creating a data set
    containing all items will be MUCH slower than creating one for just a few items, and so if you only need
    a handful of items to do your analysis, it is strongly recommended to provide those items (you can
    check the official NAACCR documentation (http://datadictionary.naaccr.org/default.aspx?Version=22)
    to find the NAACCR XML IDs to use in that list).
    The new parameter added in 8.6, cleanupcsv, can be set to "no" to aid in debugging. The default value is "yes." 
    This example references an extra user-defined dictionary that defines non-standard NAACCR data items. If
    your data file only contains standard data items, that dictionary is not needed. Otherwise the dictionary
    should have been provided by the organization that created the XML data file. Dictionaries are usually
    in XML format, but for technical reasons, the macro expects them in CSV files; the NAACCR XML Tool that
    is distributed with the macros has an option to load a dictionary and save it as CSV. The Word document here
    https://www.naaccr.org/analysis-and-data-improvement-tools/#TRANSLATION describes how to do this in detail.
    - dictfile is the path to an optional user-defined dictionary in CSV format (the NAACCR XML Tool that
        is distributed with the macros has an option to load an XML dictionary and save it as CSV).
        File*Pro can also generate those files. Use spaces to separate multiple paths if you need to
        provide more than one dictionary.
   *** IMPORTANT - The folder path(s) for the user-defined dictionaries cannot contain spaces.
 ************************************************************************************************************/

%include "/[location NAACCR utility extracted]/naaccr-xml-utility-9.0/sas/read_naaccr_xml_macro.sas";           /* NEED TO CHANGE THIS */
%readNaaccrXml(
  libpath="/[location NAACCR utility extracted]/naaccr-xml-utility-9.0/sas/naaccr-xml-9.0-sas.jar",             /* NEED TO CHANGE THIS */
  sourcefile="/[location of your files]/YourData.seer-transmission-nov2022.xml.gz",        											/* NEED TO CHANGE THIS */
  naaccrversion="220",
  recordtype="I",
  dataset=fromxml,
/* items="patientIdNumber,primarySite", */
  dictfile="/[location of your files]/seer-transmission-nov2022-dictionary.csv",                                /* NEED TO CHANGE THIS */
  cleanupcsv="yes"
);

title1 'List of all NAACCR standard data items plus non-standard data items specified in the dictfile';
proc contents data=fromxml position;
run;

*** STEP 2. Create SAS variable labels.                                                                                                ***;
***  a. In the 'filename dicts...' below, put the path and name of the NAACCR 22 base dictionary CSV on the first line. You extracted  ***;
***     this in Step 4 (a) of the Word document instructions or it could have been provided by your registry software or obtained      ***;
***     from NAACCR.org.                                                                                                               ***;
***  b. In the 'filename dicts...' below, put the path and name of the user-defined dictionary CSV that you extracted in Step 4 (b)    ***;
***     of the Word document instructions on the second line or was provided by your software vendor or standard setter. If you do not ***;
***     have a user-defined dictionary, leave this blank.                                                                              ***;
***  c. In the 'filename labels...' below, put the path where you want to store a text file of SAS labels statements. This will be     ***;
***     used in Step 4 to label the SAS variables. Leave the filename as-is, "label-statements.txt".                                   ***;
filename dicts ("/[location of your files]/base-naaccr-dictionary-220.csv"                           			 		/* NEED TO CHANGE THIS */
                "/[location of your files]/seer-transmission-nov2022-dictionary.csv");                		 		/* NEED TO CHANGE THIS */
filename labels "/[location of your files]/label-statements.txt";                                    					/* NEED TO CHANGE THIS */

data dict2labels;
   infile dicts delimiter = ',' MISSOVER DSD lrecl=32767 firstobs=2;
   format NAACCR_XML_ID      $32. 
          NAACCR_Number      best12. 
          Name               $100. 
          Start_Column       best12.
          Length             best12. 
          Record_Types       $20. 
          Parent_XML_Element $20.
          Data_Type          $20. 
          labelstatement     $200.;
   input
      NAACCR_XML_ID      $
      NAACCR_Number
      Name               $
      Start_Column       $
      Length
      Record_Types       $
      Parent_XML_Element $
      Data_Type          $;
   labelstatement = "label "||strip(NAACCR_XML_ID)||" = '"||strip(NAACCR_Number)||"_"||strip(Name)||" - "||strip(NAACCR_XML_ID)||" - "||strip(Parent_XML_Element)||"';";
   if NAACCR_XML_ID='NAACCR XML ID' then delete;
   file labels;
   put labelstatement;
run;
proc print data=dict2labels noobs n;
   var labelstatement;
run;


*** STEP 3. This is the TIDY part of the code that labels the SAS variables and drops variables that are 100% missing. If you looked         ***;
*** at the proc contents results from Step 2, you may have noticed that every single variable in the NAACCR V22 data dictionary is included, ***;
*** even though some of the variables were not included in the data file.                                                                    ***; 

*** The SAS Log will show notes for the variables not in the data file, but that you tried to label, indicating that the variable is         ***;
*** uninitialized, for example "NOTE: Variable textStaging is uninitialized." Don't worry about these notes. :)                              ***;

data xml_labeled;
   set fromxml;

   *** LIMIT TO STATE/CATCHMENT AREA CASES ***;
   where addrAtDxState in ('ID');                                                                                  /* NEED TO CHANGE THIS */        
   %include labels;
run;

* Get list of variables ;
proc transpose data=xml_labeled (obs=0) out=names ;
   var _all_;
run;

* Generate code to count non-missing values ;
filename code temp;
data _null_;
   file code ;
   set names end=eof;
   if _n_=1 then put
 'proc sql noprint;'
/'create table _counts as select' 
/' ' @
  ; else put ',' @ ;
  put 'sum(not missing(' _name_ ')) as ' _name_ ;
  if eof then put
 'from xml_labeled'
/';'
/'quit;'
  ;
run;

* Run generated code ;
%include code / source2 ;

* Generate DROP statement ;
filename code temp;
data _null_;
  set _counts ;
  array c _numeric_;
  file code lrecl=80 ;
  length _name_ $32 ;  
  put 'drop ' @;
  do _n_=1 to dim(c);
   if c(_n_)=0 then do ;
     _name_ = vname(c(_n_));
     put _name_ @ ;
  end;
  end;
  put ';' ;
run;

* Make version of data without empty variables ;
data xml_labeled_tidy ;
  set xml_labeled;
  %include code / source2;
run;

proc contents data=xml_labeled_tidy position;
run;

*** SECTION B - Create new fields for SEER*Prep and WRITE INCIDENCE DATASET FOR SEER*PREP ***;

*** STEP 1 - ADD RECODES FOR SEER*PREP ***;  
proc sort data=xml_labeled_tidy;
   by patientIdNumber recordNumberRecode;
run;

data case_recodes;
   set xml_labeled_tidy;
   by patientIdNumber recordNumberRecode;
   length stateCountyRecode $ 5;

   stateCountyRecode = put(STFIPS(addrAtDxState),z2.) || countyAtDxAnalysis;

   *** Set behavior code to 3 (malignant) for in situ urinary bladder cases ***;
   if behaviorCodeIcdO3 = '2' and primarySite >= 'C670' and primarySite <= 'C679' and not( 
   histologicTypeIcdO3 in ('9050','9051','9052','9053','9054','9055','9140') or (histologicTypeIcdO3 >= '9590' and histologicTypeIcdO3 <= '9993')) then behaviorCodeIcdO3 = '3';

   *** Race1 is overwritten using logic for 4 broad categories plus unknown. This logic is the same as NAACCR and USCS for producing rates by race. ***l
   *** The original race1 values with fine categories are preserved in a new variable, race1Original ***; 
   race1Original = race1;

   *** define seer race recode (w, b, ai/an, api) ***;
       seerracerec = race1;

      *** if white, check race2 ***;
      if seerracerec = '01' and race2 in ('02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','20','21','22','25','26','27','28',
                     '30','31','32','96','97') then seerracerec = race2;       

      *** if white, check ihslink. Apply this logic to all states except Alaska. ***;
      if seerracerec in ('01','98','99') and ihsLink = '1' and addrAtDxState ne 'AK'  then seerracerec = '03';

   *** OVERWRITE RACE1 WITH seerracerec ***;
       race1 = seerracerec;

   *** Define publicReleaseFlag for SEER registries ***;   
   yeardx = substr(dateOfDiagnosis,1,4);
   if '2000' <= yeardx <= '2020' and sex in ('1','2') then publicReleaseFlag = 1;                                                   /* MAY NEED TO CHANGE THIS */
   else publicReleaseFlag = 0;


/*************** CATEGORIZE BREAST CANCER MOLECULAR SUBTYPES (LOGIC WORKS FOR 2011+ CASES) ******************
   C500-C506, C508-C509 :   8000-8700, 8982-8983
   C501-C506, C508-C509 :   8720-8790                                                                      */

   if substr(dateOfDiagnosis,1,4) >= '2011' and behaviorCodeIcdO3 in ('2','3')
   and ((((primarySite >= 'C500' and primarySite <= 'C506') or (primarySite >= 'C508' and primarySite <= 'C509'))
   and ((histologicTypeIcdO3 >= '8000' and histologicTypeIcdO3 <= '8700') or (histologicTypeIcdO3 >= '8982' and histologicTypeIcdO3 <= '8983')))
   or  (((primarySite >= 'C501' and primarySite <= 'C506') or (primarySite >= 'C508' and primarySite <= 'C509'))
   and (histologicTypeIcdO3 >= '8720' and histologicTypeIcdO3 <= '8790'))) then do;
  
      *** 2011-2017 CASES USE csSiteSpecificFactor15 FOR HER2 ***;
      *In data submissions prior to November 2020, borderline ER/PR was classified with positive. Those '30' values are commented out below ***;
      if  substr(dateOfDiagnosis,1,4) >= '2011' and substr(dateOfDiagnosis,1,4) <= '2017' then do; 
         if (csSiteSpecificFactor1=10 /* or csSiteSpecificFactor1=30 */ or csSiteSpecificFactor2=10 /* or csSiteSpecificFactor2=30 */) AND csSiteSpecificFactor15=20 then mol_subtype = 1 ;      *HR+/HER2-;
         else if csSiteSpecificFactor1=20 AND csSiteSpecificFactor2=20 AND csSiteSpecificFactor15=20 then mol_subtype = 2 ;                                *Triple negative;
         else if (csSiteSpecificFactor1=10 /* or csSiteSpecificFactor1=30 */ or csSiteSpecificFactor2=10 /* or csSiteSpecificFactor2=30 */) AND csSiteSpecificFactor15=10 then mol_subtype = 3 ; *HR+/HER2+;
         else if (csSiteSpecificFactor1=20 and csSiteSpecificFactor2=20) AND csSiteSpecificFactor15=10  then mol_subtype= 4 ;                              *HR-/HER2+;
         else if mol_subtype=. then mol_subtype= 5;  
      end;

      *** 2018+ CASES USE SSDIs ***;
      if  substr(dateOfDiagnosis,1,4) >= '2018' then do; 
         if her2OverallSummary='1' then do;
            if estrogenReceptorSummary='1' or progesteroneRecepSummary='1' then mol_subtype = 3 ;       *HR+/HER2+;
            else if estrogenReceptorSummary='0' and progesteroneRecepSummary='0' then mol_subtype= 4 ;  *HR-/HER2+;
            else if mol_subtype=. then mol_subtype= 5;                                                  * UNKNOWN OR BORDERLINE; 
         end;
         else if her2OverallSummary='0' then do;
            if estrogenReceptorSummary='1' or progesteroneRecepSummary='1' then mol_subtype = 1 ;       *HR+/HER2-;
            else if estrogenReceptorSummary='0' and progesteroneRecepSummary='0' then mol_subtype = 2 ; *Triple negative;
            else if mol_subtype=. then mol_subtype= 5;                                                  * UNKNOWN OR BORDERLINE; 
         end;
         else if her2OverallSummary not in ('0','1') then mol_subtype= 5;                               * UNKNOWN; 
      end;
   end;
   else mol_subtype = 9;                                                                                * NOT APPLICABLE;

/*
2018+: HER2 Overall Summary site-specific data item
2011-2017: Collaborative Stage (CS) site-specific factor 15 ("HER2: Summary Result of Testing")
2010: Summary of several CS site-specific factors (described below)
For analysis over time, the following categories are available:

Positive
Negative
Borderline/Unknown*
Not 2010+ Breast
*Due to coding changes in 2018, borderline and unknown are grouped together. Prior to 2018, borderline cases can be isolated using the underlying site-specific factors.

If "Derived HER2 Recode (2010+)" is positive:
If "ER Status Recode Breast Cancer (1990+)" or "PR Status Recode Breast Cancer (1990+)" is positive*, value is Her2+/HR+
If "ER Status Recode Breast Cancer (1990+)" and "PR Status Recode Breast Cancer (1990+)" are both negative, value is Her2+/HR-
Otherwise, value is unknown or borderline*
If "Derived HER2 Recode (2010+)" is negative:
If "ER Status Recode Breast Cancer (1990+)" or "PR Status Recode Breast Cancer (1990+)" is positive*, value is Her2-/HR+
If "ER Status Recode Breast Cancer (1990+)" and "PR Status Recode Breast Cancer (1990+)" are both negative, value is triple negative
Otherwise, value is unknown or borderline*
All other values are unknown

estrogenReceptorSummary
PARENT XML ELEMENT:     Tumor
Description
ER (Estrogen Receptor) Summary is a summary of results of the estrogen receptor (ER) assay.
Rationale
This data item is required for prognostic stage grouping in AJCC 8th edition, Chapter 48, Breast. It was previously collected as Breast CS SSF # 1.
Codes
0     ER negative (0.0% or less than 1%)
1     ER positive
7     Test ordered, results not in chart
9     Not documented in medical record
Cannot be determined (indeterminate)
ER (Estrogen Receptor) Summary status not assessed or unknown if assessed

progesteroneRecepSummary
PARENT XML ELEMENT:     Tumor
Description
PR (Progesterone Receptor) Summary is a summary of results from the progesterone receptor (PR) assay.
Rationale
This data item is required for prognostic stage grouping in AJCC 8th edition, Chapter 48, Breast. It was previously collected as Breast CS SSF # 2.
Codes
0     PR negative (0.0% or less than 1%)
1     PR positive
7     Test ordered, results not in chart
9     Not documented in medical record
Cannot be determined (indeterminate)
PR (Progesterone Receptor) Summary status not assessed or unknown if assessed

her2OverallSummary
0     HER2 negative; equivocal
1     HER2 positive
7     Test ordered, results not in chart
8     Not applicable: Information not collected for this case
(If this item is required by your standard setter, use of code 8 will result in an edit error.)
9     Not documented in medical record
Cannot be determined (indeterminate)
HER2 Overall Summary status not assessed or unknown if assessed
*/


   *** Other restrictions for each registry to consider:
       -Delete out of state cases, if any - none should be included in data submittion files
       -Other? ***;
run;
title1 'Frequency Distribution of Diagnosis Year, recoded race1';
proc freq data=case_recodes;
   tables yeardx;
   tables race1 * seerracerec * race1Original * race2 * ihsLink / list missing;
run;


*** STEP 4. SPLIT AND EXPORT CSV FILES TO BE READ INTO SEER*PREP ***;
data _null_;
   set case_recodes nobs=nobs;
   splitsize = 300000;
   n_splits  = ceil(nobs / splitsize);
   call symput("cnt", (left(put(nobs,best.))));
   call symput("numsplits", (left(put(n_splits,best.))));
   regtxt = "'"||strip(addrAtDxState)||"'";
   call symput("reg", (left(put(regtxt,$4.))));
   regnoq = strip(addrAtDxState);
   call symput("regnoq", (left(put(regnoq,$2.))));
   strtxt = '$'||strip(n_splits*5)||'.';
   call symput("strglen", (left(put(strtxt,$12.)))); 
run;

data fnamestring;
   length fnamestring &strglen.; 
   fnamestring = &reg.||'1';
   do i = 2 to &numsplits.;
      fnamestring = strip(fnamestring)||' '||&reg.||left(put(i,$2.));
   end;
   call symput("dsnames", (left(put(fnamestring,&strglen.))));
run;

data case_recodes2;
   format outtxtstring exptxtstring1-exptxtstring4 $100. exptxtstring $400.;
   set case_recodes;
   by patientIdNumber recordNumberRecode;   
   retain filenum (1) ind (0);
   if (&numsplits > 1 & _n_ > (ind + (&cnt/&numsplits)) & first.patientIdNumber) then do;
      ind = _n_;
      filenum = filenum + 1;
   end;  
   outtxtstring = 'if filenum = '||filenum||' then output '||left(addrAtDxState)||left(filenum)||';';
   exptxtstring1= 'proc export data=work.'||left(addrAtDxState)||left(filenum)||' outfile="';

   *** SPECIFY THE OUTPUT PATH FOR YOUR INCIDENCE CSV FILES.  ***;
   exptxtstring2="/sprj/seerprep/new_dd_files_NAACCR22/webinar/casefiles/";                                    /* NEED TO CHANGE THIS */

   exptxtstring3=left(addrAtDxState)||left(filenum);
   exptxtstring4='.recoded.csv" dbms=csv replace; run;'; 
   exptxtstring = strip(exptxtstring1)||strip(exptxtstring2)||strip(exptxtstring3)||strip(exptxtstring4);
   drop exptxtstring1-exptxtstring4;
run; 
title1 'Number of output CSV files and records per file, outtxtstring exptxtstring';
proc freq data=case_recodes2;
   tables filenum outtxtstring exptxtstring;
run;


*** SPLIT INTO MULTIPLE SAS DATASETS ***;
proc sort data=case_recodes2 out=filetext nodupkey;
   by outtxtstring;
run;
filename toincl temp;
data _null_;
   set filetext;
   file toincl;
   put outtxtstring $;
run;
data &dsnames.;
   set case_recodes2;
   %include toincl;
   drop outtxtstring exptxtstring yeardx seerracerec filenum ind;
run;


*** EXPORT EACH OF THE SAS DATASETS TO CSV FOR USE IN SEERPREP ***;
proc sort data=case_recodes2 out=exporttext nodupkey;
   by exptxtstring;
run;
filename toexp temp;
data _null_;
   set exporttext;
   file toexp;
   put exptxtstring $;
run;
data _null_;
   %include toexp;
run;