library(tidyverse)
library(readxl)
library(readr)

Import Data

Copied from RStudio’s “Import Dataset…”

cr_patients <- read_excel("CancerRegistryDataPatients.xlsx",
                          col_types = c("numeric", "numeric", "text",
                                        "text", "text", "text", "date", "text"))

cr_patients <- read_csv("CancerRegistryDataPatients.csv",
                        col_types = cols(`Date Last Patient Contact/Dead` = col_date(format = "%m/%d/%y")))

View Data

# View(cr_patients)
glimpse(cr_patients)
Observations: 150
Variables: 8
$ `Medical Record Number`          <int> 687391, 906008, 162232, 71024...
$ `Age, Current`                   <int> 72, 76, 74, 86, 66, 62, 70, 5...
$ Sex                              <chr> "FEMALE", "FEMALE", "FEMALE",...
$ `Spanish/Hispanic Origin`        <chr> "NON-SPANISH(0)", "PUERTO RIC...
$ Race                             <chr> "WHITE", "WHITE", "BLACK", "W...
$ `Vital Status`                   <chr> "ALIVE", "ALIVE", "ALIVE", "D...
$ `Date Last Patient Contact/Dead` <date> 2017-08-20, 2018-01-15, 2016...
$ `DC, Cause of Death`             <chr> "0", NA, NA, "C509", "C509", ...
head(cr_patients)
# A tibble: 6 x 8
  `Medical Record… `Age, Current` Sex   `Spanish/Hispan… Race 
             <int>          <int> <chr> <chr>            <chr>
1           687391             72 FEMA… NON-SPANISH(0)   WHITE
2           906008             76 FEMA… PUERTO RICAN(2)  WHITE
3           162232             74 FEMA… NON-SPANISH(0)   BLACK
4           710246             86 FEMA… NON-SPANISH(0)   WHITE
5           317251             66 FEMA… DOMINICAN REPUB… WHITE
6           175897             62 FEMA… NON-SPANISH(0)   WHITE
# ... with 3 more variables: `Vital Status` <chr>, `Date Last Patient
#   Contact/Dead` <date>, `DC, Cause of Death` <chr>
tail(cr_patients)
# A tibble: 6 x 8
  `Medical Record… `Age, Current` Sex   `Spanish/Hispan… Race 
             <int>          <int> <chr> <chr>            <chr>
1           240959             51 FEMA… SPANISH NOS(6)   WHITE
2           446950             70 FEMA… NON-SPANISH(0)   WHITE
3           884615             47 FEMA… SPANISH NOS(6)   WHITE
4           712493             66 FEMA… NON-SPANISH(0)   OTHER
5           757065             80 FEMA… NON-SPANISH(0)   WHITE
6           901534             51 FEMA… NON-SPANISH(0)   WHITE
# ... with 3 more variables: `Vital Status` <chr>, `Date Last Patient
#   Contact/Dead` <date>, `DC, Cause of Death` <chr>
nrow(cr_patients)
[1] 150
names(cr_patients)
[1] "Medical Record Number"          "Age, Current"                  
[3] "Sex"                            "Spanish/Hispanic Origin"       
[5] "Race"                           "Vital Status"                  
[7] "Date Last Patient Contact/Dead" "DC, Cause of Death"            

How many of each?

How many of each: Sex?

count(cr_patients, Sex)
# A tibble: 1 x 2
  Sex        n
  <chr>  <int>
1 FEMALE   150

How many of each: Spanish/Hispanic Origin?

count(cr_patients, `Spanish/Hispanic Origin`)
# A tibble: 6 x 2
  `Spanish/Hispanic Origin`     n
  <chr>                     <int>
1 DOMINICAN REPUBLIC(8)         1
2 MEXICAN(1)                    2
3 NON-SPANISH(0)              135
4 PUERTO RICAN(2)               5
5 SPANISH NOS(6)                6
6 UNKNOWN(9)                    1

How many of each: Race?

count(cr_patients, Race)
# A tibble: 3 x 2
  Race      n
  <chr> <int>
1 BLACK    10
2 OTHER     9
3 WHITE   131

How many of each: Vital Status + DC, Cause of Death?

count(cr_patients, `Vital Status`, `DC, Cause of Death`)
# A tibble: 5 x 3
  `Vital Status` `DC, Cause of Death`     n
  <chr>          <chr>                <int>
1 ALIVE          0                      103
2 ALIVE          <NA>                    16
3 DEAD           7777                     1
4 DEAD           C509                    28
5 NONE           0                        2

Rename

To make things easier, rename the columns representing cause_of_death and vital_status.

cr_patients <- cr_patients %>%
  rename(cause_of_death = `DC, Cause of Death`,
         vital_status = `Vital Status`)
cr_patients
# A tibble: 150 x 8
   `Medical Record… `Age, Current` Sex   `Spanish/Hispan… Race 
              <int>          <int> <chr> <chr>            <chr>
 1           687391             72 FEMA… NON-SPANISH(0)   WHITE
 2           906008             76 FEMA… PUERTO RICAN(2)  WHITE
 3           162232             74 FEMA… NON-SPANISH(0)   BLACK
 4           710246             86 FEMA… NON-SPANISH(0)   WHITE
 5           317251             66 FEMA… DOMINICAN REPUB… WHITE
 6           175897             62 FEMA… NON-SPANISH(0)   WHITE
 7           197932             70 FEMA… NON-SPANISH(0)   WHITE
 8           446531             59 FEMA… NON-SPANISH(0)   WHITE
 9           148456             84 FEMA… SPANISH NOS(6)   WHITE
10           874768             58 FEMA… NON-SPANISH(0)   WHITE
# ... with 140 more rows, and 3 more variables: vital_status <chr>, `Date
#   Last Patient Contact/Dead` <date>, cause_of_death <chr>

Fix Coding Issues

Create a new variable called hispanic that is FALSE if Spanish/Hispanic Origin is not equal to NON-SPANISH(0) or UNKNOWN(9).

cr_patients <- cr_patients %>%
  mutate(hispanic = !(`Spanish/Hispanic Origin` %in% c("NON-SPANISH(0)", "UNKNOWN(9)")))
cr_patients
# A tibble: 150 x 9
   `Medical Record… `Age, Current` Sex   `Spanish/Hispan… Race 
              <int>          <int> <chr> <chr>            <chr>
 1           687391             72 FEMA… NON-SPANISH(0)   WHITE
 2           906008             76 FEMA… PUERTO RICAN(2)  WHITE
 3           162232             74 FEMA… NON-SPANISH(0)   BLACK
 4           710246             86 FEMA… NON-SPANISH(0)   WHITE
 5           317251             66 FEMA… DOMINICAN REPUB… WHITE
 6           175897             62 FEMA… NON-SPANISH(0)   WHITE
 7           197932             70 FEMA… NON-SPANISH(0)   WHITE
 8           446531             59 FEMA… NON-SPANISH(0)   WHITE
 9           148456             84 FEMA… SPANISH NOS(6)   WHITE
10           874768             58 FEMA… NON-SPANISH(0)   WHITE
# ... with 140 more rows, and 4 more variables: vital_status <chr>, `Date
#   Last Patient Contact/Dead` <date>, cause_of_death <chr>,
#   hispanic <lgl>

Recode vital_status: replace the value "NONE" with a missing value. To do this, you can use the helper function na_if() from dplyr.

cr_patients <- cr_patients %>%
  mutate(vital_status = na_if(vital_status, "NONE"))
cr_patients
# A tibble: 150 x 9
   `Medical Record… `Age, Current` Sex   `Spanish/Hispan… Race 
              <int>          <int> <chr> <chr>            <chr>
 1           687391             72 FEMA… NON-SPANISH(0)   WHITE
 2           906008             76 FEMA… PUERTO RICAN(2)  WHITE
 3           162232             74 FEMA… NON-SPANISH(0)   BLACK
 4           710246             86 FEMA… NON-SPANISH(0)   WHITE
 5           317251             66 FEMA… DOMINICAN REPUB… WHITE
 6           175897             62 FEMA… NON-SPANISH(0)   WHITE
 7           197932             70 FEMA… NON-SPANISH(0)   WHITE
 8           446531             59 FEMA… NON-SPANISH(0)   WHITE
 9           148456             84 FEMA… SPANISH NOS(6)   WHITE
10           874768             58 FEMA… NON-SPANISH(0)   WHITE
# ... with 140 more rows, and 4 more variables: vital_status <chr>, `Date
#   Last Patient Contact/Dead` <date>, cause_of_death <chr>,
#   hispanic <lgl>

Fix the following two issues with cause_of_death using a single mutate().

  1. The death certificate was not available for one patient (who was coded as "7777"), but the certificate has been found and the code should be "C508".

  2. If the patient is still alive but cause of death is missing, then the value should be "0".

cr_patients <- cr_patients %>%
  mutate(
    cause_of_death = recode(cause_of_death, "7777" = "C508"),
    cause_of_death = if_else(is.na(cause_of_death) & vital_status == "ALIVE", "0", cause_of_death)
  )
cr_patients
# A tibble: 150 x 9
   `Medical Record… `Age, Current` Sex   `Spanish/Hispan… Race 
              <int>          <int> <chr> <chr>            <chr>
 1           687391             72 FEMA… NON-SPANISH(0)   WHITE
 2           906008             76 FEMA… PUERTO RICAN(2)  WHITE
 3           162232             74 FEMA… NON-SPANISH(0)   BLACK
 4           710246             86 FEMA… NON-SPANISH(0)   WHITE
 5           317251             66 FEMA… DOMINICAN REPUB… WHITE
 6           175897             62 FEMA… NON-SPANISH(0)   WHITE
 7           197932             70 FEMA… NON-SPANISH(0)   WHITE
 8           446531             59 FEMA… NON-SPANISH(0)   WHITE
 9           148456             84 FEMA… SPANISH NOS(6)   WHITE
10           874768             58 FEMA… NON-SPANISH(0)   WHITE
# ... with 140 more rows, and 4 more variables: vital_status <chr>, `Date
#   Last Patient Contact/Dead` <date>, cause_of_death <chr>,
#   hispanic <lgl>

Task 1

Task 1-A

For this task, the following request was made:

  1. Caucasian, non-hispanic patients,
  2. who are still alive
  3. at least 55 or older
  4. ordered by most recent contact
  5. with only the columns MRN, Age, and Last Contact
#task1 <-
cr_patients %>%
  filter(Race == "WHITE",
         vital_status == "ALIVE",
         `Age, Current` >= 55) %>%
  arrange(desc(`Date Last Patient Contact/Dead`)) %>%
  select(MRN = `Medical Record Number`,
         Age = `Age, Current`,
         `Last Contact` = `Date Last Patient Contact/Dead`)
# A tibble: 84 x 3
      MRN   Age `Last Contact`
    <int> <int> <date>        
 1 654223    57 2019-01-30    
 2 780544    67 2019-01-20    
 3 175897    62 2018-12-29    
 4 344727    70 2018-12-29    
 5 492119    59 2018-12-18    
 6 151416    62 2018-11-17    
 7 820782    87 2018-10-12    
 8 523329    67 2018-08-25    
 9 194777    71 2018-08-08    
10 363994    69 2018-07-11    
# ... with 74 more rows

Create a folder called “output_04” in your project and save the output into the folder as a CSV file using write_csv().

write_csv("output_04/task1a.csv")

Task 1-B

Copy the code from Task 1-A and then modify it to include only patients who were contacted since January 1, 2017. Export your results to a CSV file.

#task1b <-
cr_patients %>%
  filter(Race == "WHITE",
         vital_status == "ALIVE",
         `Age, Current` >= 55,
         `Date Last Patient Contact/Dead` >= "2017-01-01") %>%
  arrange(desc(`Date Last Patient Contact/Dead`)) %>%
  select(MRN = `Medical Record Number`,
         Age = `Age, Current`,
         `Last Contact` = `Date Last Patient Contact/Dead`)
# A tibble: 36 x 3
      MRN   Age `Last Contact`
    <int> <int> <date>        
 1 654223    57 2019-01-30    
 2 780544    67 2019-01-20    
 3 175897    62 2018-12-29    
 4 344727    70 2018-12-29    
 5 492119    59 2018-12-18    
 6 151416    62 2018-11-17    
 7 820782    87 2018-10-12    
 8 523329    67 2018-08-25    
 9 194777    71 2018-08-08    
10 363994    69 2018-07-11    
# ... with 26 more rows

Task 2

Task 2-A

Calculate the average age of patients by race, vital status, and hispanic origin. Again, save your results as a CSV file.

# task2a <-
cr_patients %>%
  group_by(Race, vital_status, hispanic) %>%
  summarize(avg_age = mean(`Age, Current`))
# A tibble: 13 x 4
# Groups:   Race, vital_status [?]
   Race  vital_status hispanic avg_age
   <chr> <chr>        <lgl>      <dbl>
 1 BLACK ALIVE        FALSE       70.2
 2 BLACK ALIVE        TRUE        75  
 3 BLACK DEAD         FALSE       53.3
 4 OTHER ALIVE        FALSE       70  
 5 OTHER ALIVE        TRUE        75  
 6 OTHER DEAD         FALSE       77  
 7 OTHER DEAD         TRUE        53  
 8 OTHER <NA>         FALSE       66  
 9 WHITE ALIVE        FALSE       63.8
10 WHITE ALIVE        TRUE        63.9
11 WHITE DEAD         FALSE       66.5
12 WHITE DEAD         TRUE        70.7
13 WHITE <NA>         FALSE       80  

Task 2-B

In addition to the average age, report the minimum, maximum and count of patients in each group from Task 2-A. For the count, this time you can use the function n().

# task2b <-
cr_patients %>%
  group_by(Race, vital_status, hispanic) %>%
  summarize(avg_age = mean(`Age, Current`),
            min_age = min(`Age, Current`),
            max_age = max(`Age, Current`),
            n = n())
# A tibble: 13 x 7
# Groups:   Race, vital_status [?]
   Race  vital_status hispanic avg_age min_age max_age     n
   <chr> <chr>        <lgl>      <dbl>   <dbl>   <dbl> <int>
 1 BLACK ALIVE        FALSE       70.2      57      88     6
 2 BLACK ALIVE        TRUE        75        75      75     1
 3 BLACK DEAD         FALSE       53.3      46      63     3
 4 OTHER ALIVE        FALSE       70        50      84     4
 5 OTHER ALIVE        TRUE        75        68      82     2
 6 OTHER DEAD         FALSE       77        77      77     1
 7 OTHER DEAD         TRUE        53        53      53     1
 8 OTHER <NA>         FALSE       66        66      66     1
 9 WHITE ALIVE        FALSE       63.8      24      90    99
10 WHITE ALIVE        TRUE        63.9      47      84     7
11 WHITE DEAD         FALSE       66.5      37      90    21
12 WHITE DEAD         TRUE        70.7      62      84     3
13 WHITE <NA>         FALSE       80        80      80     1

Task 2-C

Calculate the average age of patients by year of last contact. To do this, you can load the lubridate package and use the function year() on any columns with dates.

library(lubridate)

Attaching package: 'lubridate'
The following object is masked from 'package:base':

    date
#task2c <-
cr_patients %>%
  mutate(year = year(`Date Last Patient Contact/Dead`)) %>%
  group_by(year) %>%
  summarize(avg_age = mean(`Age, Current`))
# A tibble: 9 x 2
   year avg_age
  <dbl>   <dbl>
1  2011    72  
2  2012    62.6
3  2013    68  
4  2014    65.3
5  2015    67.4
6  2016    65.4
7  2017    61.4
8  2018    64.7
9  2019    58.3

Task 3

Export a CSV containing the following information:

  1. Patients who
    • Are deceased
    • At least 55 years or older
  2. Race recoded as “Caucasian” or “Other”
  3. Ordered by MRN
  4. With the columns
    • MRN
    • Age
    • Race
    • CoD
#task3 <-
cr_patients %>%
  filter(vital_status == "DEAD", `Age, Current` >= 55) %>%
  arrange(`Medical Record Number`) %>%
  select(MRN = `Medical Record Number`, Age = `Age, Current`, Race, CoD = cause_of_death) %>%
  mutate(Race = if_else(Race == "WHITE", "Caucasian", "Other"))
# A tibble: 21 x 4
      MRN   Age Race      CoD  
    <int> <int> <chr>     <chr>
 1 317251    66 Caucasian C509 
 2 347062    62 Caucasian C509 
 3 359703    84 Caucasian C509 
 4 385615    72 Caucasian C509 
 5 390671    60 Caucasian C509 
 6 414593    58 Caucasian C509 
 7 423286    67 Caucasian C509 
 8 424101    63 Caucasian C509 
 9 424590    90 Caucasian C509 
10 435037    66 Caucasian C509 
# ... with 11 more rows

Bonus Challenge

Read in the data file CancerRegistryData.csv and answer the following questions:

In which year were the most patients diagnosed?

What is the average, minimum and maximum age at diagnosis in each year?

Do all of the values from the previous question make sense? If not, fix or remove the patients with bad values in the next step.

What is the average age at diagnosis, average survival time in years and number of patients diagnosed for each primary cancer site, ordered by prevalence?

CancerRegistryData <- read_csv("CancerRegistryData.csv",
    col_types = cols(`Date Last Patient Contact/Dead` = col_datetime(format = "%m/%d/%y"),
        `Date of Diagnosis` = col_datetime(format = "%m/%d/%y")))


CancerRegistryData %>%
  mutate(year = lubridate::year(`Date of Diagnosis`)) %>%
  count(year) %>%
  arrange(desc(n))
# A tibble: 24 x 2
    year     n
   <dbl> <int>
 1  2013    21
 2  2011    19
 3  2012    19
 4  2014    16
 5  2010    14
 6  2009    11
 7  2015    11
 8  2008     8
 9  2016     6
10  2006     5
11  2007     5
12    NA     5
13  2005     4
14  1991     3
15  1996     3
16  2001     3
17  2002     3
18  1998     2
19  2000     2
20  2003     2
21  2004     2
22  1992     1
23  1993     1
24  1999     1
CancerRegistryData %>%
  mutate(year = lubridate::year(`Date of Diagnosis`)) %>%
  group_by(year) %>%
  summarize(age = mean(`Age at Diagnosis`),
            min = min(`Age at Diagnosis`),
            max = max(`Age at Diagnosis`))
# A tibble: 24 x 4
    year   age   min   max
   <dbl> <dbl> <dbl> <dbl>
 1  1991  47.7    45    49
 2  1992  65      65    65
 3  1993  43      43    43
 4  1996  43.3    39    52
 5  1998  52      40    64
 6  1999  71      71    71
 7  2000  57      53    61
 8  2001  54.7    40    71
 9  2002  51.7    38    60
10  2003  59      59    59
11  2004  51      46    56
12  2005  57.5    45    74
13  2006  64.2    51    80
14  2007  58      54    64
15  2008  63.5    46    81
16  2009  52.7    37    70
17  2010  56.4    29    79
18  2011  58      30    83
19  2012  60.9    35    89
20  2013  60.4    36    85
21  2014  57.6    22    77
22  2015  61.6    34    84
23  2016  61      46    85
24    NA 619.     41   999
CancerRegistryData %>%
  filter(!is.na(`Sequence Number`)) %>%
  group_by(`Site - Primary (ICD-O-3)`) %>%
  summarize(
    age = mean(`Age at Diagnosis`),
    survival_time = mean(`Survival Time (months)`),
    n = n()
  ) %>%
  mutate(survival_time = survival_time / 12) %>%
  arrange(desc(n))
# A tibble: 19 x 4
   `Site - Primary (ICD-O-3)`         age survival_time     n
   <chr>                            <dbl>         <dbl> <int>
 1 C504 (BREAST UOQ)                 55.6         6.42     45
 2 C508 (BREAST OVERLAPPING LESION)  60.0         5.54     33
 3 C509 (BREAST NOS)                 59.3         8.65     26
 4 C502 (BREAST UIQ)                 58.8         4.76     18
 5 C503 (BREAST LIQ)                 51.8         7.46     13
 6 C501 (BREAST CENTRAL)             60.5         5.65      6
 7 C505 (BREAST LOQ)                 54.3         3.81      6
 8 C541 (ENDOMETRIUM)                74.2         2.5       4
 9 C569 (OVARY)                      67.5         8.08      2
10 C182 (COLON ASCENDING)            59          12         1
11 C383 (MEDIASTINUM NOS)            22           1.75      1
12 C421 (BONE MARROW)                62           4.42      1
13 C506 (BREAST TAIL)                46           3.25      1
14 C529 (VAGINA)                     65           3.25      1
15 C649 (KIDNEY NOS)                 58          11.2       1
16 C700 (CRANIAL MENINGES)           78           3.17      1
17 C739 (THYROID GLAND)              54           5.08      1
18 C778 (LYMPH NODES MULTIPLE)       77           0.833     1
19 C809 (UNKNOWN PRIMARY SITE)       63           0.333     1