library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Import Data

If you don’t have the data in your working directory, you can download it by uncommenting and running the two lines below.

# download.file("https://git.io/fATNV", "CancerRegistryData.csv")
# download.file("https://git.io/fATNo", "CancerRegistryDataPatients.csv")

cr_patients <- read_csv(
  "CancerRegistryDataPatients.csv",
  col_types = cols(`Date Last Patient Contact/Dead` = col_date(format = "%m/%d/%y"))
)

cr <- read_csv(
  "CancerRegistryData.csv",
  col_types = cols(
    `Date Last Patient Contact/Dead` = col_date(format = "%m/%d/%y"),
    `Date of Diagnosis` = col_date(format = "%m/%d/%y")
  )
)

Task 1

Merge the two data sets to add the patient information from cr_patients to the corresponding diagnosis records in cr

full_join(cr_patients, cr, by = "Medical Record Number")
## # A tibble: 167 x 28
##    `Medical Record… `Age, Current.x` Sex.x `Spanish/Hispan… Race.x
##               <int>            <int> <chr> <chr>            <chr> 
##  1           687391               72 FEMA… NON-SPANISH(0)   WHITE 
##  2           687391               72 FEMA… NON-SPANISH(0)   WHITE 
##  3           906008               76 FEMA… PUERTO RICAN(2)  WHITE 
##  4           162232               74 FEMA… NON-SPANISH(0)   BLACK 
##  5           710246               86 FEMA… NON-SPANISH(0)   WHITE 
##  6           317251               66 FEMA… DOMINICAN REPUB… WHITE 
##  7           175897               62 FEMA… NON-SPANISH(0)   WHITE 
##  8           197932               70 FEMA… NON-SPANISH(0)   WHITE 
##  9           446531               59 FEMA… NON-SPANISH(0)   WHITE 
## 10           446531               59 FEMA… NON-SPANISH(0)   WHITE 
## # ... with 157 more rows, and 23 more variables: `Vital Status.x` <chr>,
## #   `Date Last Patient Contact/Dead.x` <date>, `DC, Cause of
## #   Death.x` <chr>, `Sequence Number` <int>, `Site - Primary
## #   (ICD-O-3)` <chr>, `Histology/Behavior (ICD-O-3)` <chr>, `Date of
## #   Diagnosis` <date>, `Age at Diagnosis` <int>, `Age, Current.y` <int>,
## #   Sex.y <chr>, `Spanish/Hispanic Origin.y` <chr>, Race.y <chr>, `Vital
## #   Status.y` <chr>, `Date Last Patient Contact/Dead.y` <date>, `Survival
## #   Time (months)` <int>, `DC, Cause of Death.y` <chr>, `Clinical Tumor
## #   Size` <dbl>, `Clinical Size Unit` <chr>, `Tobacco Use (FCDS)` <chr>,
## #   `Summary of RX,1st Course` <chr>, `Payer Source 1 at DX` <chr>,
## #   `Patient History, Alcohol` <chr>, `Marital Status at DX` <chr>

Task 2

Find deceased patients at least 75 years or older. Create a data frame containing their patient information and all of their diagnosis records.

task2_patients <- cr_patients %>%
  filter(`Age, Current` >= 75, `Vital Status` == "DEAD")

left_join(task2_patients, cr, by = "Medical Record Number")
## # A tibble: 9 x 28
##   `Medical Record… `Age, Current.x` Sex.x `Spanish/Hispan… Race.x
##              <int>            <int> <chr> <chr>            <chr> 
## 1           710246               86 FEMA… NON-SPANISH(0)   WHITE 
## 2           359703               84 FEMA… PUERTO RICAN(2)  WHITE 
## 3           623039               87 FEMA… NON-SPANISH(0)   WHITE 
## 4           636827               77 FEMA… NON-SPANISH(0)   OTHER 
## 5           563418               75 FEMA… NON-SPANISH(0)   WHITE 
## 6           463338               81 FEMA… NON-SPANISH(0)   WHITE 
## 7           463338               81 FEMA… NON-SPANISH(0)   WHITE 
## 8           424590               90 FEMA… NON-SPANISH(0)   WHITE 
## 9           482228               87 FEMA… NON-SPANISH(0)   WHITE 
## # ... with 23 more variables: `Vital Status.x` <chr>, `Date Last Patient
## #   Contact/Dead.x` <date>, `DC, Cause of Death.x` <chr>, `Sequence
## #   Number` <int>, `Site - Primary (ICD-O-3)` <chr>, `Histology/Behavior
## #   (ICD-O-3)` <chr>, `Date of Diagnosis` <date>, `Age at
## #   Diagnosis` <int>, `Age, Current.y` <int>, Sex.y <chr>,
## #   `Spanish/Hispanic Origin.y` <chr>, Race.y <chr>, `Vital
## #   Status.y` <chr>, `Date Last Patient Contact/Dead.y` <date>, `Survival
## #   Time (months)` <int>, `DC, Cause of Death.y` <chr>, `Clinical Tumor
## #   Size` <dbl>, `Clinical Size Unit` <chr>, `Tobacco Use (FCDS)` <chr>,
## #   `Summary of RX,1st Course` <chr>, `Payer Source 1 at DX` <chr>,
## #   `Patient History, Alcohol` <chr>, `Marital Status at DX` <chr>

Task 3

Find the diagnosis records of the African American patients.

task3_patients <- cr_patients %>%
  filter(Race == "BLACK")

semi_join(cr, task3_patients, by = "Medical Record Number")
## # A tibble: 10 x 21
##    `Medical Record… `Sequence Numbe… `Site - Primary… `Histology/Beha…
##               <int>            <int> <chr>            <chr>           
##  1           162232                0 C503 (BREAST LI… 85003 (INFILTRA…
##  2           502990                0 C508 (BREAST OV… 85003 (INFILTRA…
##  3           263178                0 C504 (BREAST UO… 85233 (INFILTRA…
##  4           690660                0 C504 (BREAST UO… 85003 (INFILTRA…
##  5           428083                0 C508 (BREAST OV… 85003 (INFILTRA…
##  6           336282                0 C504 (BREAST UO… 82113 (TUBULAR …
##  7           529556                0 C509 (BREAST NO… 85003 (INFILTRA…
##  8           178994                0 C505 (BREAST LO… 85003 (INFILTRA…
##  9           320079                0 C504 (BREAST UO… 85003 (INFILTRA…
## 10           528978                0 C505 (BREAST LO… 85203 (LOBULAR …
## # ... with 17 more variables: `Date of Diagnosis` <date>, `Age at
## #   Diagnosis` <int>, `Age, Current` <int>, Sex <chr>, `Spanish/Hispanic
## #   Origin` <chr>, Race <chr>, `Vital Status` <chr>, `Date Last Patient
## #   Contact/Dead` <date>, `Survival Time (months)` <int>, `DC, Cause of
## #   Death` <chr>, `Clinical Tumor Size` <dbl>, `Clinical Size Unit` <chr>,
## #   `Tobacco Use (FCDS)` <chr>, `Summary of RX,1st Course` <chr>, `Payer
## #   Source 1 at DX` <chr>, `Patient History, Alcohol` <chr>, `Marital
## #   Status at DX` <chr>
task3_patients
## # A tibble: 10 x 8
##    `Medical Record… `Age, Current` Sex   `Spanish/Hispan… Race 
##               <int>          <int> <chr> <chr>            <chr>
##  1           162232             74 FEMA… NON-SPANISH(0)   BLACK
##  2           502990             75 FEMA… PUERTO RICAN(2)  BLACK
##  3           263178             64 FEMA… NON-SPANISH(0)   BLACK
##  4           690660             51 FEMA… NON-SPANISH(0)   BLACK
##  5           428083             88 FEMA… NON-SPANISH(0)   BLACK
##  6           336282             72 FEMA… UNKNOWN(9)       BLACK
##  7           529556             63 FEMA… NON-SPANISH(0)   BLACK
##  8           178994             46 FEMA… NON-SPANISH(0)   BLACK
##  9           320079             66 FEMA… NON-SPANISH(0)   BLACK
## 10           528978             57 FEMA… NON-SPANISH(0)   BLACK
## # ... with 3 more variables: `Vital Status` <chr>, `Date Last Patient
## #   Contact/Dead` <date>, `DC, Cause of Death` <chr>
semi_join(cr, task3_patients, by = "Medical Record Number") %>%
  group_by(`Medical Record Number`) %>% count()
## # A tibble: 10 x 2
## # Groups:   Medical Record Number [10]
##    `Medical Record Number`     n
##                      <int> <int>
##  1                  162232     1
##  2                  178994     1
##  3                  263178     1
##  4                  320079     1
##  5                  336282     1
##  6                  428083     1
##  7                  502990     1
##  8                  528978     1
##  9                  529556     1
## 10                  690660     1

BONUS: Find the diagnosis records of married African American patients using two joins.

task3_married <- cr %>%
  filter(`Marital Status at DX` == "MARRIED") %>%
  select(`Medical Record Number`, `Marital Status at DX`) %>%
  distinct()

semi_join(cr, task3_patients, by = "Medical Record Number") %>%
  semi_join(task3_married)
## Joining, by = c("Medical Record Number", "Marital Status at DX")
## # A tibble: 6 x 21
##   `Medical Record… `Sequence Numbe… `Site - Primary… `Histology/Beha…
##              <int>            <int> <chr>            <chr>           
## 1           162232                0 C503 (BREAST LI… 85003 (INFILTRA…
## 2           263178                0 C504 (BREAST UO… 85233 (INFILTRA…
## 3           336282                0 C504 (BREAST UO… 82113 (TUBULAR …
## 4           178994                0 C505 (BREAST LO… 85003 (INFILTRA…
## 5           320079                0 C504 (BREAST UO… 85003 (INFILTRA…
## 6           528978                0 C505 (BREAST LO… 85203 (LOBULAR …
## # ... with 17 more variables: `Date of Diagnosis` <date>, `Age at
## #   Diagnosis` <int>, `Age, Current` <int>, Sex <chr>, `Spanish/Hispanic
## #   Origin` <chr>, Race <chr>, `Vital Status` <chr>, `Date Last Patient
## #   Contact/Dead` <date>, `Survival Time (months)` <int>, `DC, Cause of
## #   Death` <chr>, `Clinical Tumor Size` <dbl>, `Clinical Size Unit` <chr>,
## #   `Tobacco Use (FCDS)` <chr>, `Summary of RX,1st Course` <chr>, `Payer
## #   Source 1 at DX` <chr>, `Patient History, Alcohol` <chr>, `Marital
## #   Status at DX` <chr>

Task 4

First, verify that all patients in the diagnosis records data set cr appear in the patient information data set using anti_join

Then, find all patients who have had more than one cancer diagnosis and create a data frame with only their patient information, using semi_join().

anti_join(cr_patients, cr, by = "Medical Record Number")
## # A tibble: 0 x 8
## # ... with 8 variables: `Medical Record Number` <int>, `Age,
## #   Current` <int>, Sex <chr>, `Spanish/Hispanic Origin` <chr>,
## #   Race <chr>, `Vital Status` <chr>, `Date Last Patient
## #   Contact/Dead` <date>, `DC, Cause of Death` <chr>
task4_patients <- cr %>%
  group_by(`Medical Record Number`) %>%
  count() %>%
  filter(n > 1)

semi_join(cr, task4_patients, by = "Medical Record Number")
## # A tibble: 34 x 21
##    `Medical Record… `Sequence Numbe… `Site - Primary… `Histology/Beha…
##               <int>            <int> <chr>            <chr>           
##  1           687391                1 C509 (BREAST NO… 85202 (LOBULAR …
##  2           446531                1 C504 (BREAST UO… 85223 (INFILTRA…
##  3           334557                1 C504 (BREAST UO… 85003 (INFILTRA…
##  4           341050                1 C569 (OVARY)     84603 (PAPILLAR…
##  5           896356                2 C182 (COLON ASC… 81403 (ADENOCAR…
##  6           896356                1 C509 (BREAST NO… 85203 (LOBULAR …
##  7           558013                1 C649 (KIDNEY NO… 83123 (RENAL CE…
##  8           124227                0 C501 (BREAST CE… 85003 (INFILTRA…
##  9           341050                2 C508 (BREAST OV… 85003 (INFILTRA…
## 10           984584                1 C503 (BREAST LI… 85003 (INFILTRA…
## # ... with 24 more rows, and 17 more variables: `Date of
## #   Diagnosis` <date>, `Age at Diagnosis` <int>, `Age, Current` <int>,
## #   Sex <chr>, `Spanish/Hispanic Origin` <chr>, Race <chr>, `Vital
## #   Status` <chr>, `Date Last Patient Contact/Dead` <date>, `Survival Time
## #   (months)` <int>, `DC, Cause of Death` <chr>, `Clinical Tumor
## #   Size` <dbl>, `Clinical Size Unit` <chr>, `Tobacco Use (FCDS)` <chr>,
## #   `Summary of RX,1st Course` <chr>, `Payer Source 1 at DX` <chr>,
## #   `Patient History, Alcohol` <chr>, `Marital Status at DX` <chr>

Task 5

Find patients who were last contacted in 2017 and merge their patient information with their diagnosis records. Transform all dates to be the number of days since their earliest diagnosis.

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
# Find patients last contacted in 2017
task5_selected <- cr_patients %>%
  filter(year(`Date Last Patient Contact/Dead`) == 2017)

# Create a data frame with patient and earliest diagnosis
task5_first_dx <- cr %>%
  group_by(`Medical Record Number`) %>%
  summarize(first_diagnosis = min(`Date of Diagnosis`))

# Merge task5_selected and task5_first_dx so that selected patients also have
# a `first_diagnosis` column
task5_patients <- left_join(task5_selected, task5_first_dx, by = "Medical Record Number") %>%
  select(-`Date Last Patient Contact/Dead`)

task5_patients %>%
  select(`Medical Record Number`, first_diagnosis)
## # A tibble: 29 x 2
##    `Medical Record Number` first_diagnosis
##                      <int> <date>         
##  1                  687391 1991-02-20     
##  2                  446531 1996-12-02     
##  3                  148456 1998-10-25     
##  4                  934875 2001-11-02     
##  5                  644232 2005-09-06     
##  6                  789242 2006-04-29     
##  7                  348966 2006-09-24     
##  8                  487851 2009-03-03     
##  9                  158774 2009-09-11     
## 10                  524664 2010-03-24     
## # ... with 19 more rows
# Then join patient information and diagnosis records and transform dates.
full_join(task5_patients, cr, by = "Medical Record Number") %>%
  mutate(
    `Date Last Patient Contact/Dead` = `Date Last Patient Contact/Dead` - first_diagnosis,
    `Date Last Patient Contact/Dead` = as.integer(`Date Last Patient Contact/Dead`),
    `Date of Diagnosis` = `Date of Diagnosis` - first_diagnosis,
    `Date of Diagnosis` = as.integer(`Date of Diagnosis`),
  ) %>%
  select(`Medical Record Number`, starts_with("date"))
## # A tibble: 167 x 3
##    `Medical Record Numbe… `Date of Diagnosis` `Date Last Patient Contact/…
##                     <int>               <int>                        <int>
##  1                 687391                   0                         9678
##  2                 687391                6838                         9678
##  3                 446531                   0                         7358
##  4                 446531                5529                         7358
##  5                 148456                   0                         6864
##  6                 934875                   0                         5646
##  7                 644232                   0                         4169
##  8                 789242                   0                         3907
##  9                 348966                   0                         4037
## 10                 487851                   0                         3128
## # ... with 157 more rows