library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
If you don’t have the data in your working directory, you can download it by uncommenting and running the two lines below.
# download.file("https://git.io/fATNV", "CancerRegistryData.csv")
# download.file("https://git.io/fATNo", "CancerRegistryDataPatients.csv")
cr_patients <- read_csv(
"CancerRegistryDataPatients.csv",
col_types = cols(`Date Last Patient Contact/Dead` = col_date(format = "%m/%d/%y"))
)
cr <- read_csv(
"CancerRegistryData.csv",
col_types = cols(
`Date Last Patient Contact/Dead` = col_date(format = "%m/%d/%y"),
`Date of Diagnosis` = col_date(format = "%m/%d/%y")
)
)
Merge the two data sets to add the patient information from cr_patients
to the corresponding diagnosis records in cr
full_join(cr_patients, cr, by = "Medical Record Number")
## # A tibble: 167 x 28
## `Medical Record… `Age, Current.x` Sex.x `Spanish/Hispan… Race.x
## <int> <int> <chr> <chr> <chr>
## 1 687391 72 FEMA… NON-SPANISH(0) WHITE
## 2 687391 72 FEMA… NON-SPANISH(0) WHITE
## 3 906008 76 FEMA… PUERTO RICAN(2) WHITE
## 4 162232 74 FEMA… NON-SPANISH(0) BLACK
## 5 710246 86 FEMA… NON-SPANISH(0) WHITE
## 6 317251 66 FEMA… DOMINICAN REPUB… WHITE
## 7 175897 62 FEMA… NON-SPANISH(0) WHITE
## 8 197932 70 FEMA… NON-SPANISH(0) WHITE
## 9 446531 59 FEMA… NON-SPANISH(0) WHITE
## 10 446531 59 FEMA… NON-SPANISH(0) WHITE
## # ... with 157 more rows, and 23 more variables: `Vital Status.x` <chr>,
## # `Date Last Patient Contact/Dead.x` <date>, `DC, Cause of
## # Death.x` <chr>, `Sequence Number` <int>, `Site - Primary
## # (ICD-O-3)` <chr>, `Histology/Behavior (ICD-O-3)` <chr>, `Date of
## # Diagnosis` <date>, `Age at Diagnosis` <int>, `Age, Current.y` <int>,
## # Sex.y <chr>, `Spanish/Hispanic Origin.y` <chr>, Race.y <chr>, `Vital
## # Status.y` <chr>, `Date Last Patient Contact/Dead.y` <date>, `Survival
## # Time (months)` <int>, `DC, Cause of Death.y` <chr>, `Clinical Tumor
## # Size` <dbl>, `Clinical Size Unit` <chr>, `Tobacco Use (FCDS)` <chr>,
## # `Summary of RX,1st Course` <chr>, `Payer Source 1 at DX` <chr>,
## # `Patient History, Alcohol` <chr>, `Marital Status at DX` <chr>
Find deceased patients at least 75 years or older. Create a data frame containing their patient information and all of their diagnosis records.
task2_patients <- cr_patients %>%
filter(`Age, Current` >= 75, `Vital Status` == "DEAD")
left_join(task2_patients, cr, by = "Medical Record Number")
## # A tibble: 9 x 28
## `Medical Record… `Age, Current.x` Sex.x `Spanish/Hispan… Race.x
## <int> <int> <chr> <chr> <chr>
## 1 710246 86 FEMA… NON-SPANISH(0) WHITE
## 2 359703 84 FEMA… PUERTO RICAN(2) WHITE
## 3 623039 87 FEMA… NON-SPANISH(0) WHITE
## 4 636827 77 FEMA… NON-SPANISH(0) OTHER
## 5 563418 75 FEMA… NON-SPANISH(0) WHITE
## 6 463338 81 FEMA… NON-SPANISH(0) WHITE
## 7 463338 81 FEMA… NON-SPANISH(0) WHITE
## 8 424590 90 FEMA… NON-SPANISH(0) WHITE
## 9 482228 87 FEMA… NON-SPANISH(0) WHITE
## # ... with 23 more variables: `Vital Status.x` <chr>, `Date Last Patient
## # Contact/Dead.x` <date>, `DC, Cause of Death.x` <chr>, `Sequence
## # Number` <int>, `Site - Primary (ICD-O-3)` <chr>, `Histology/Behavior
## # (ICD-O-3)` <chr>, `Date of Diagnosis` <date>, `Age at
## # Diagnosis` <int>, `Age, Current.y` <int>, Sex.y <chr>,
## # `Spanish/Hispanic Origin.y` <chr>, Race.y <chr>, `Vital
## # Status.y` <chr>, `Date Last Patient Contact/Dead.y` <date>, `Survival
## # Time (months)` <int>, `DC, Cause of Death.y` <chr>, `Clinical Tumor
## # Size` <dbl>, `Clinical Size Unit` <chr>, `Tobacco Use (FCDS)` <chr>,
## # `Summary of RX,1st Course` <chr>, `Payer Source 1 at DX` <chr>,
## # `Patient History, Alcohol` <chr>, `Marital Status at DX` <chr>
Find the diagnosis records of the African American patients.
task3_patients <- cr_patients %>%
filter(Race == "BLACK")
semi_join(cr, task3_patients, by = "Medical Record Number")
## # A tibble: 10 x 21
## `Medical Record… `Sequence Numbe… `Site - Primary… `Histology/Beha…
## <int> <int> <chr> <chr>
## 1 162232 0 C503 (BREAST LI… 85003 (INFILTRA…
## 2 502990 0 C508 (BREAST OV… 85003 (INFILTRA…
## 3 263178 0 C504 (BREAST UO… 85233 (INFILTRA…
## 4 690660 0 C504 (BREAST UO… 85003 (INFILTRA…
## 5 428083 0 C508 (BREAST OV… 85003 (INFILTRA…
## 6 336282 0 C504 (BREAST UO… 82113 (TUBULAR …
## 7 529556 0 C509 (BREAST NO… 85003 (INFILTRA…
## 8 178994 0 C505 (BREAST LO… 85003 (INFILTRA…
## 9 320079 0 C504 (BREAST UO… 85003 (INFILTRA…
## 10 528978 0 C505 (BREAST LO… 85203 (LOBULAR …
## # ... with 17 more variables: `Date of Diagnosis` <date>, `Age at
## # Diagnosis` <int>, `Age, Current` <int>, Sex <chr>, `Spanish/Hispanic
## # Origin` <chr>, Race <chr>, `Vital Status` <chr>, `Date Last Patient
## # Contact/Dead` <date>, `Survival Time (months)` <int>, `DC, Cause of
## # Death` <chr>, `Clinical Tumor Size` <dbl>, `Clinical Size Unit` <chr>,
## # `Tobacco Use (FCDS)` <chr>, `Summary of RX,1st Course` <chr>, `Payer
## # Source 1 at DX` <chr>, `Patient History, Alcohol` <chr>, `Marital
## # Status at DX` <chr>
task3_patients
## # A tibble: 10 x 8
## `Medical Record… `Age, Current` Sex `Spanish/Hispan… Race
## <int> <int> <chr> <chr> <chr>
## 1 162232 74 FEMA… NON-SPANISH(0) BLACK
## 2 502990 75 FEMA… PUERTO RICAN(2) BLACK
## 3 263178 64 FEMA… NON-SPANISH(0) BLACK
## 4 690660 51 FEMA… NON-SPANISH(0) BLACK
## 5 428083 88 FEMA… NON-SPANISH(0) BLACK
## 6 336282 72 FEMA… UNKNOWN(9) BLACK
## 7 529556 63 FEMA… NON-SPANISH(0) BLACK
## 8 178994 46 FEMA… NON-SPANISH(0) BLACK
## 9 320079 66 FEMA… NON-SPANISH(0) BLACK
## 10 528978 57 FEMA… NON-SPANISH(0) BLACK
## # ... with 3 more variables: `Vital Status` <chr>, `Date Last Patient
## # Contact/Dead` <date>, `DC, Cause of Death` <chr>
semi_join(cr, task3_patients, by = "Medical Record Number") %>%
group_by(`Medical Record Number`) %>% count()
## # A tibble: 10 x 2
## # Groups: Medical Record Number [10]
## `Medical Record Number` n
## <int> <int>
## 1 162232 1
## 2 178994 1
## 3 263178 1
## 4 320079 1
## 5 336282 1
## 6 428083 1
## 7 502990 1
## 8 528978 1
## 9 529556 1
## 10 690660 1
BONUS: Find the diagnosis records of married African American patients using two joins.
task3_married <- cr %>%
filter(`Marital Status at DX` == "MARRIED") %>%
select(`Medical Record Number`, `Marital Status at DX`) %>%
distinct()
semi_join(cr, task3_patients, by = "Medical Record Number") %>%
semi_join(task3_married)
## Joining, by = c("Medical Record Number", "Marital Status at DX")
## # A tibble: 6 x 21
## `Medical Record… `Sequence Numbe… `Site - Primary… `Histology/Beha…
## <int> <int> <chr> <chr>
## 1 162232 0 C503 (BREAST LI… 85003 (INFILTRA…
## 2 263178 0 C504 (BREAST UO… 85233 (INFILTRA…
## 3 336282 0 C504 (BREAST UO… 82113 (TUBULAR …
## 4 178994 0 C505 (BREAST LO… 85003 (INFILTRA…
## 5 320079 0 C504 (BREAST UO… 85003 (INFILTRA…
## 6 528978 0 C505 (BREAST LO… 85203 (LOBULAR …
## # ... with 17 more variables: `Date of Diagnosis` <date>, `Age at
## # Diagnosis` <int>, `Age, Current` <int>, Sex <chr>, `Spanish/Hispanic
## # Origin` <chr>, Race <chr>, `Vital Status` <chr>, `Date Last Patient
## # Contact/Dead` <date>, `Survival Time (months)` <int>, `DC, Cause of
## # Death` <chr>, `Clinical Tumor Size` <dbl>, `Clinical Size Unit` <chr>,
## # `Tobacco Use (FCDS)` <chr>, `Summary of RX,1st Course` <chr>, `Payer
## # Source 1 at DX` <chr>, `Patient History, Alcohol` <chr>, `Marital
## # Status at DX` <chr>
First, verify that all patients in the diagnosis records data set cr
appear in the patient information data set using anti_join
Then, find all patients who have had more than one cancer diagnosis and create a data frame with only their patient information, using semi_join()
.
anti_join(cr_patients, cr, by = "Medical Record Number")
## # A tibble: 0 x 8
## # ... with 8 variables: `Medical Record Number` <int>, `Age,
## # Current` <int>, Sex <chr>, `Spanish/Hispanic Origin` <chr>,
## # Race <chr>, `Vital Status` <chr>, `Date Last Patient
## # Contact/Dead` <date>, `DC, Cause of Death` <chr>
task4_patients <- cr %>%
group_by(`Medical Record Number`) %>%
count() %>%
filter(n > 1)
semi_join(cr, task4_patients, by = "Medical Record Number")
## # A tibble: 34 x 21
## `Medical Record… `Sequence Numbe… `Site - Primary… `Histology/Beha…
## <int> <int> <chr> <chr>
## 1 687391 1 C509 (BREAST NO… 85202 (LOBULAR …
## 2 446531 1 C504 (BREAST UO… 85223 (INFILTRA…
## 3 334557 1 C504 (BREAST UO… 85003 (INFILTRA…
## 4 341050 1 C569 (OVARY) 84603 (PAPILLAR…
## 5 896356 2 C182 (COLON ASC… 81403 (ADENOCAR…
## 6 896356 1 C509 (BREAST NO… 85203 (LOBULAR …
## 7 558013 1 C649 (KIDNEY NO… 83123 (RENAL CE…
## 8 124227 0 C501 (BREAST CE… 85003 (INFILTRA…
## 9 341050 2 C508 (BREAST OV… 85003 (INFILTRA…
## 10 984584 1 C503 (BREAST LI… 85003 (INFILTRA…
## # ... with 24 more rows, and 17 more variables: `Date of
## # Diagnosis` <date>, `Age at Diagnosis` <int>, `Age, Current` <int>,
## # Sex <chr>, `Spanish/Hispanic Origin` <chr>, Race <chr>, `Vital
## # Status` <chr>, `Date Last Patient Contact/Dead` <date>, `Survival Time
## # (months)` <int>, `DC, Cause of Death` <chr>, `Clinical Tumor
## # Size` <dbl>, `Clinical Size Unit` <chr>, `Tobacco Use (FCDS)` <chr>,
## # `Summary of RX,1st Course` <chr>, `Payer Source 1 at DX` <chr>,
## # `Patient History, Alcohol` <chr>, `Marital Status at DX` <chr>
Find patients who were last contacted in 2017 and merge their patient information with their diagnosis records. Transform all dates to be the number of days since their earliest diagnosis.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
# Find patients last contacted in 2017
task5_selected <- cr_patients %>%
filter(year(`Date Last Patient Contact/Dead`) == 2017)
# Create a data frame with patient and earliest diagnosis
task5_first_dx <- cr %>%
group_by(`Medical Record Number`) %>%
summarize(first_diagnosis = min(`Date of Diagnosis`))
# Merge task5_selected and task5_first_dx so that selected patients also have
# a `first_diagnosis` column
task5_patients <- left_join(task5_selected, task5_first_dx, by = "Medical Record Number") %>%
select(-`Date Last Patient Contact/Dead`)
task5_patients %>%
select(`Medical Record Number`, first_diagnosis)
## # A tibble: 29 x 2
## `Medical Record Number` first_diagnosis
## <int> <date>
## 1 687391 1991-02-20
## 2 446531 1996-12-02
## 3 148456 1998-10-25
## 4 934875 2001-11-02
## 5 644232 2005-09-06
## 6 789242 2006-04-29
## 7 348966 2006-09-24
## 8 487851 2009-03-03
## 9 158774 2009-09-11
## 10 524664 2010-03-24
## # ... with 19 more rows
# Then join patient information and diagnosis records and transform dates.
full_join(task5_patients, cr, by = "Medical Record Number") %>%
mutate(
`Date Last Patient Contact/Dead` = `Date Last Patient Contact/Dead` - first_diagnosis,
`Date Last Patient Contact/Dead` = as.integer(`Date Last Patient Contact/Dead`),
`Date of Diagnosis` = `Date of Diagnosis` - first_diagnosis,
`Date of Diagnosis` = as.integer(`Date of Diagnosis`),
) %>%
select(`Medical Record Number`, starts_with("date"))
## # A tibble: 167 x 3
## `Medical Record Numbe… `Date of Diagnosis` `Date Last Patient Contact/…
## <int> <int> <int>
## 1 687391 0 9678
## 2 687391 6838 9678
## 3 446531 0 7358
## 4 446531 5529 7358
## 5 148456 0 6864
## 6 934875 0 5646
## 7 644232 0 4169
## 8 789242 0 3907
## 9 348966 0 4037
## 10 487851 0 3128
## # ... with 157 more rows