library(tidyverse)
library(readxl)
library(readr)
Copied from RStudio’s “Import Dataset…”
cr_patients <- read_excel("CancerRegistryDataPatients.xlsx",
col_types = c("numeric", "numeric", "text",
"text", "text", "text", "date", "text"))
cr_patients <- read_csv("CancerRegistryDataPatients.csv",
col_types = cols(`Date Last Patient Contact/Dead` = col_date(format = "%m/%d/%y")))
# View(cr_patients)
glimpse(cr_patients)
Observations: 150
Variables: 8
$ `Medical Record Number` <int> 687391, 906008, 162232, 71024...
$ `Age, Current` <int> 72, 76, 74, 86, 66, 62, 70, 5...
$ Sex <chr> "FEMALE", "FEMALE", "FEMALE",...
$ `Spanish/Hispanic Origin` <chr> "NON-SPANISH(0)", "PUERTO RIC...
$ Race <chr> "WHITE", "WHITE", "BLACK", "W...
$ `Vital Status` <chr> "ALIVE", "ALIVE", "ALIVE", "D...
$ `Date Last Patient Contact/Dead` <date> 2017-08-20, 2018-01-15, 2016...
$ `DC, Cause of Death` <chr> "0", NA, NA, "C509", "C509", ...
head(cr_patients)
# A tibble: 6 x 8
`Medical Record… `Age, Current` Sex `Spanish/Hispan… Race
<int> <int> <chr> <chr> <chr>
1 687391 72 FEMA… NON-SPANISH(0) WHITE
2 906008 76 FEMA… PUERTO RICAN(2) WHITE
3 162232 74 FEMA… NON-SPANISH(0) BLACK
4 710246 86 FEMA… NON-SPANISH(0) WHITE
5 317251 66 FEMA… DOMINICAN REPUB… WHITE
6 175897 62 FEMA… NON-SPANISH(0) WHITE
# ... with 3 more variables: `Vital Status` <chr>, `Date Last Patient
# Contact/Dead` <date>, `DC, Cause of Death` <chr>
tail(cr_patients)
# A tibble: 6 x 8
`Medical Record… `Age, Current` Sex `Spanish/Hispan… Race
<int> <int> <chr> <chr> <chr>
1 240959 51 FEMA… SPANISH NOS(6) WHITE
2 446950 70 FEMA… NON-SPANISH(0) WHITE
3 884615 47 FEMA… SPANISH NOS(6) WHITE
4 712493 66 FEMA… NON-SPANISH(0) OTHER
5 757065 80 FEMA… NON-SPANISH(0) WHITE
6 901534 51 FEMA… NON-SPANISH(0) WHITE
# ... with 3 more variables: `Vital Status` <chr>, `Date Last Patient
# Contact/Dead` <date>, `DC, Cause of Death` <chr>
nrow(cr_patients)
[1] 150
names(cr_patients)
[1] "Medical Record Number" "Age, Current"
[3] "Sex" "Spanish/Hispanic Origin"
[5] "Race" "Vital Status"
[7] "Date Last Patient Contact/Dead" "DC, Cause of Death"
How many of each: Sex
?
count(cr_patients, Sex)
# A tibble: 1 x 2
Sex n
<chr> <int>
1 FEMALE 150
How many of each: Spanish/Hispanic Origin
?
count(cr_patients, `Spanish/Hispanic Origin`)
# A tibble: 6 x 2
`Spanish/Hispanic Origin` n
<chr> <int>
1 DOMINICAN REPUBLIC(8) 1
2 MEXICAN(1) 2
3 NON-SPANISH(0) 135
4 PUERTO RICAN(2) 5
5 SPANISH NOS(6) 6
6 UNKNOWN(9) 1
How many of each: Race
?
count(cr_patients, Race)
# A tibble: 3 x 2
Race n
<chr> <int>
1 BLACK 10
2 OTHER 9
3 WHITE 131
How many of each: Vital Status
+ DC, Cause of Death
?
count(cr_patients, `Vital Status`, `DC, Cause of Death`)
# A tibble: 5 x 3
`Vital Status` `DC, Cause of Death` n
<chr> <chr> <int>
1 ALIVE 0 103
2 ALIVE <NA> 16
3 DEAD 7777 1
4 DEAD C509 28
5 NONE 0 2
To make things easier, rename the columns representing cause_of_death
and vital_status
.
cr_patients <- cr_patients %>%
rename(cause_of_death = `DC, Cause of Death`,
vital_status = `Vital Status`)
cr_patients
# A tibble: 150 x 8
`Medical Record… `Age, Current` Sex `Spanish/Hispan… Race
<int> <int> <chr> <chr> <chr>
1 687391 72 FEMA… NON-SPANISH(0) WHITE
2 906008 76 FEMA… PUERTO RICAN(2) WHITE
3 162232 74 FEMA… NON-SPANISH(0) BLACK
4 710246 86 FEMA… NON-SPANISH(0) WHITE
5 317251 66 FEMA… DOMINICAN REPUB… WHITE
6 175897 62 FEMA… NON-SPANISH(0) WHITE
7 197932 70 FEMA… NON-SPANISH(0) WHITE
8 446531 59 FEMA… NON-SPANISH(0) WHITE
9 148456 84 FEMA… SPANISH NOS(6) WHITE
10 874768 58 FEMA… NON-SPANISH(0) WHITE
# ... with 140 more rows, and 3 more variables: vital_status <chr>, `Date
# Last Patient Contact/Dead` <date>, cause_of_death <chr>
Create a new variable called hispanic
that is FALSE
if Spanish/Hispanic Origin
is not equal to NON-SPANISH(0)
or UNKNOWN(9)
.
cr_patients <- cr_patients %>%
mutate(hispanic = !(`Spanish/Hispanic Origin` %in% c("NON-SPANISH(0)", "UNKNOWN(9)")))
cr_patients
# A tibble: 150 x 9
`Medical Record… `Age, Current` Sex `Spanish/Hispan… Race
<int> <int> <chr> <chr> <chr>
1 687391 72 FEMA… NON-SPANISH(0) WHITE
2 906008 76 FEMA… PUERTO RICAN(2) WHITE
3 162232 74 FEMA… NON-SPANISH(0) BLACK
4 710246 86 FEMA… NON-SPANISH(0) WHITE
5 317251 66 FEMA… DOMINICAN REPUB… WHITE
6 175897 62 FEMA… NON-SPANISH(0) WHITE
7 197932 70 FEMA… NON-SPANISH(0) WHITE
8 446531 59 FEMA… NON-SPANISH(0) WHITE
9 148456 84 FEMA… SPANISH NOS(6) WHITE
10 874768 58 FEMA… NON-SPANISH(0) WHITE
# ... with 140 more rows, and 4 more variables: vital_status <chr>, `Date
# Last Patient Contact/Dead` <date>, cause_of_death <chr>,
# hispanic <lgl>
Recode vital_status
: replace the value "NONE"
with a missing value. To do this, you can use the helper function na_if()
from dplyr.
cr_patients <- cr_patients %>%
mutate(vital_status = na_if(vital_status, "NONE"))
cr_patients
# A tibble: 150 x 9
`Medical Record… `Age, Current` Sex `Spanish/Hispan… Race
<int> <int> <chr> <chr> <chr>
1 687391 72 FEMA… NON-SPANISH(0) WHITE
2 906008 76 FEMA… PUERTO RICAN(2) WHITE
3 162232 74 FEMA… NON-SPANISH(0) BLACK
4 710246 86 FEMA… NON-SPANISH(0) WHITE
5 317251 66 FEMA… DOMINICAN REPUB… WHITE
6 175897 62 FEMA… NON-SPANISH(0) WHITE
7 197932 70 FEMA… NON-SPANISH(0) WHITE
8 446531 59 FEMA… NON-SPANISH(0) WHITE
9 148456 84 FEMA… SPANISH NOS(6) WHITE
10 874768 58 FEMA… NON-SPANISH(0) WHITE
# ... with 140 more rows, and 4 more variables: vital_status <chr>, `Date
# Last Patient Contact/Dead` <date>, cause_of_death <chr>,
# hispanic <lgl>
Fix the following two issues with cause_of_death
using a single mutate()
.
The death certificate was not available for one patient (who was coded as "7777"
), but the certificate has been found and the code should be "C508"
.
If the patient is still alive but cause of death is missing, then the value should be "0"
.
cr_patients <- cr_patients %>%
mutate(
cause_of_death = recode(cause_of_death, "7777" = "C508"),
cause_of_death = if_else(is.na(cause_of_death) & vital_status == "ALIVE", "0", cause_of_death)
)
cr_patients
# A tibble: 150 x 9
`Medical Record… `Age, Current` Sex `Spanish/Hispan… Race
<int> <int> <chr> <chr> <chr>
1 687391 72 FEMA… NON-SPANISH(0) WHITE
2 906008 76 FEMA… PUERTO RICAN(2) WHITE
3 162232 74 FEMA… NON-SPANISH(0) BLACK
4 710246 86 FEMA… NON-SPANISH(0) WHITE
5 317251 66 FEMA… DOMINICAN REPUB… WHITE
6 175897 62 FEMA… NON-SPANISH(0) WHITE
7 197932 70 FEMA… NON-SPANISH(0) WHITE
8 446531 59 FEMA… NON-SPANISH(0) WHITE
9 148456 84 FEMA… SPANISH NOS(6) WHITE
10 874768 58 FEMA… NON-SPANISH(0) WHITE
# ... with 140 more rows, and 4 more variables: vital_status <chr>, `Date
# Last Patient Contact/Dead` <date>, cause_of_death <chr>,
# hispanic <lgl>
For this task, the following request was made:
MRN
, Age
, and Last Contact
#task1 <-
cr_patients %>%
filter(Race == "WHITE",
vital_status == "ALIVE",
`Age, Current` >= 55) %>%
arrange(desc(`Date Last Patient Contact/Dead`)) %>%
select(MRN = `Medical Record Number`,
Age = `Age, Current`,
`Last Contact` = `Date Last Patient Contact/Dead`)
# A tibble: 84 x 3
MRN Age `Last Contact`
<int> <int> <date>
1 654223 57 2019-01-30
2 780544 67 2019-01-20
3 175897 62 2018-12-29
4 344727 70 2018-12-29
5 492119 59 2018-12-18
6 151416 62 2018-11-17
7 820782 87 2018-10-12
8 523329 67 2018-08-25
9 194777 71 2018-08-08
10 363994 69 2018-07-11
# ... with 74 more rows
Create a folder called “output_04” in your project and save the output into the folder as a CSV file using write_csv()
.
write_csv("output_04/task1a.csv")
Copy the code from Task 1-A and then modify it to include only patients who were contacted since January 1, 2017. Export your results to a CSV file.
#task1b <-
cr_patients %>%
filter(Race == "WHITE",
vital_status == "ALIVE",
`Age, Current` >= 55,
`Date Last Patient Contact/Dead` >= "2017-01-01") %>%
arrange(desc(`Date Last Patient Contact/Dead`)) %>%
select(MRN = `Medical Record Number`,
Age = `Age, Current`,
`Last Contact` = `Date Last Patient Contact/Dead`)
# A tibble: 36 x 3
MRN Age `Last Contact`
<int> <int> <date>
1 654223 57 2019-01-30
2 780544 67 2019-01-20
3 175897 62 2018-12-29
4 344727 70 2018-12-29
5 492119 59 2018-12-18
6 151416 62 2018-11-17
7 820782 87 2018-10-12
8 523329 67 2018-08-25
9 194777 71 2018-08-08
10 363994 69 2018-07-11
# ... with 26 more rows
Calculate the average age of patients by race, vital status, and hispanic origin. Again, save your results as a CSV file.
# task2a <-
cr_patients %>%
group_by(Race, vital_status, hispanic) %>%
summarize(avg_age = mean(`Age, Current`))
# A tibble: 13 x 4
# Groups: Race, vital_status [?]
Race vital_status hispanic avg_age
<chr> <chr> <lgl> <dbl>
1 BLACK ALIVE FALSE 70.2
2 BLACK ALIVE TRUE 75
3 BLACK DEAD FALSE 53.3
4 OTHER ALIVE FALSE 70
5 OTHER ALIVE TRUE 75
6 OTHER DEAD FALSE 77
7 OTHER DEAD TRUE 53
8 OTHER <NA> FALSE 66
9 WHITE ALIVE FALSE 63.8
10 WHITE ALIVE TRUE 63.9
11 WHITE DEAD FALSE 66.5
12 WHITE DEAD TRUE 70.7
13 WHITE <NA> FALSE 80
In addition to the average age, report the minimum, maximum and count of patients in each group from Task 2-A. For the count, this time you can use the function n()
.
# task2b <-
cr_patients %>%
group_by(Race, vital_status, hispanic) %>%
summarize(avg_age = mean(`Age, Current`),
min_age = min(`Age, Current`),
max_age = max(`Age, Current`),
n = n())
# A tibble: 13 x 7
# Groups: Race, vital_status [?]
Race vital_status hispanic avg_age min_age max_age n
<chr> <chr> <lgl> <dbl> <dbl> <dbl> <int>
1 BLACK ALIVE FALSE 70.2 57 88 6
2 BLACK ALIVE TRUE 75 75 75 1
3 BLACK DEAD FALSE 53.3 46 63 3
4 OTHER ALIVE FALSE 70 50 84 4
5 OTHER ALIVE TRUE 75 68 82 2
6 OTHER DEAD FALSE 77 77 77 1
7 OTHER DEAD TRUE 53 53 53 1
8 OTHER <NA> FALSE 66 66 66 1
9 WHITE ALIVE FALSE 63.8 24 90 99
10 WHITE ALIVE TRUE 63.9 47 84 7
11 WHITE DEAD FALSE 66.5 37 90 21
12 WHITE DEAD TRUE 70.7 62 84 3
13 WHITE <NA> FALSE 80 80 80 1
Calculate the average age of patients by year of last contact. To do this, you can load the lubridate
package and use the function year()
on any columns with dates.
library(lubridate)
Attaching package: 'lubridate'
The following object is masked from 'package:base':
date
#task2c <-
cr_patients %>%
mutate(year = year(`Date Last Patient Contact/Dead`)) %>%
group_by(year) %>%
summarize(avg_age = mean(`Age, Current`))
# A tibble: 9 x 2
year avg_age
<dbl> <dbl>
1 2011 72
2 2012 62.6
3 2013 68
4 2014 65.3
5 2015 67.4
6 2016 65.4
7 2017 61.4
8 2018 64.7
9 2019 58.3
Export a CSV containing the following information:
#task3 <-
cr_patients %>%
filter(vital_status == "DEAD", `Age, Current` >= 55) %>%
arrange(`Medical Record Number`) %>%
select(MRN = `Medical Record Number`, Age = `Age, Current`, Race, CoD = cause_of_death) %>%
mutate(Race = if_else(Race == "WHITE", "Caucasian", "Other"))
# A tibble: 21 x 4
MRN Age Race CoD
<int> <int> <chr> <chr>
1 317251 66 Caucasian C509
2 347062 62 Caucasian C509
3 359703 84 Caucasian C509
4 385615 72 Caucasian C509
5 390671 60 Caucasian C509
6 414593 58 Caucasian C509
7 423286 67 Caucasian C509
8 424101 63 Caucasian C509
9 424590 90 Caucasian C509
10 435037 66 Caucasian C509
# ... with 11 more rows
Read in the data file CancerRegistryData.csv
and answer the following questions:
In which year were the most patients diagnosed?
What is the average, minimum and maximum age at diagnosis in each year?
Do all of the values from the previous question make sense? If not, fix or remove the patients with bad values in the next step.
What is the average age at diagnosis, average survival time in years and number of patients diagnosed for each primary cancer site, ordered by prevalence?
CancerRegistryData <- read_csv("CancerRegistryData.csv",
col_types = cols(`Date Last Patient Contact/Dead` = col_datetime(format = "%m/%d/%y"),
`Date of Diagnosis` = col_datetime(format = "%m/%d/%y")))
CancerRegistryData %>%
mutate(year = lubridate::year(`Date of Diagnosis`)) %>%
count(year) %>%
arrange(desc(n))
# A tibble: 24 x 2
year n
<dbl> <int>
1 2013 21
2 2011 19
3 2012 19
4 2014 16
5 2010 14
6 2009 11
7 2015 11
8 2008 8
9 2016 6
10 2006 5
11 2007 5
12 NA 5
13 2005 4
14 1991 3
15 1996 3
16 2001 3
17 2002 3
18 1998 2
19 2000 2
20 2003 2
21 2004 2
22 1992 1
23 1993 1
24 1999 1
CancerRegistryData %>%
mutate(year = lubridate::year(`Date of Diagnosis`)) %>%
group_by(year) %>%
summarize(age = mean(`Age at Diagnosis`),
min = min(`Age at Diagnosis`),
max = max(`Age at Diagnosis`))
# A tibble: 24 x 4
year age min max
<dbl> <dbl> <dbl> <dbl>
1 1991 47.7 45 49
2 1992 65 65 65
3 1993 43 43 43
4 1996 43.3 39 52
5 1998 52 40 64
6 1999 71 71 71
7 2000 57 53 61
8 2001 54.7 40 71
9 2002 51.7 38 60
10 2003 59 59 59
11 2004 51 46 56
12 2005 57.5 45 74
13 2006 64.2 51 80
14 2007 58 54 64
15 2008 63.5 46 81
16 2009 52.7 37 70
17 2010 56.4 29 79
18 2011 58 30 83
19 2012 60.9 35 89
20 2013 60.4 36 85
21 2014 57.6 22 77
22 2015 61.6 34 84
23 2016 61 46 85
24 NA 619. 41 999
CancerRegistryData %>%
filter(!is.na(`Sequence Number`)) %>%
group_by(`Site - Primary (ICD-O-3)`) %>%
summarize(
age = mean(`Age at Diagnosis`),
survival_time = mean(`Survival Time (months)`),
n = n()
) %>%
mutate(survival_time = survival_time / 12) %>%
arrange(desc(n))
# A tibble: 19 x 4
`Site - Primary (ICD-O-3)` age survival_time n
<chr> <dbl> <dbl> <int>
1 C504 (BREAST UOQ) 55.6 6.42 45
2 C508 (BREAST OVERLAPPING LESION) 60.0 5.54 33
3 C509 (BREAST NOS) 59.3 8.65 26
4 C502 (BREAST UIQ) 58.8 4.76 18
5 C503 (BREAST LIQ) 51.8 7.46 13
6 C501 (BREAST CENTRAL) 60.5 5.65 6
7 C505 (BREAST LOQ) 54.3 3.81 6
8 C541 (ENDOMETRIUM) 74.2 2.5 4
9 C569 (OVARY) 67.5 8.08 2
10 C182 (COLON ASCENDING) 59 12 1
11 C383 (MEDIASTINUM NOS) 22 1.75 1
12 C421 (BONE MARROW) 62 4.42 1
13 C506 (BREAST TAIL) 46 3.25 1
14 C529 (VAGINA) 65 3.25 1
15 C649 (KIDNEY NOS) 58 11.2 1
16 C700 (CRANIAL MENINGES) 78 3.17 1
17 C739 (THYROID GLAND) 54 5.08 1
18 C778 (LYMPH NODES MULTIPLE) 77 0.833 1
19 C809 (UNKNOWN PRIMARY SITE) 63 0.333 1