Chapter 3 Data manipulation
3.1 Collapse multiple “no” and “yes” options
Common to have to do this in GlobalSurg projects:
library(dplyr)
= tibble(
mydata ssi.factor = c("No", "Yes, no treatment/wound opened only (CD 1)",
"Yes, antibiotics only (CD 2)", "Yes, return to operating theatre (CD 3)",
"Yes, requiring critical care admission (CD 4)",
"Yes, resulting in death (CD 5)",
"Unknown") %>%
factor(),
mri.factor = c("No, not available", "No, not indicated",
"No, indicated and facilities available, but patient not able to pay",
"Yes", "Unknown", "Unknown", "Unknown") %>%
factor()
)
# Two functions make this work
= function(.f){
fct_collapse_yn %>%
.f ::fct_relabel(~ gsub("^No.*", "No", .)) %>%
forcats::fct_relabel(~ gsub("^Yes.*", "Yes", .))
forcats
}
= function(.data){
is.yn = is.factor(.data)
.f = .data %>%
.yn levels() %>%
grepl("(No.+)|(Yes.+)", .) %>%
any()
all(.f, .yn)
}
# Raw variable
%>%
mydata count(ssi.factor)
## # A tibble: 7 × 2
## ssi.factor n
## <fct> <int>
## 1 No 1
## 2 Unknown 1
## 3 Yes, antibiotics only (CD 2) 1
## 4 Yes, no treatment/wound opened only (CD 1) 1
## 5 Yes, requiring critical care admission (CD 4) 1
## 6 Yes, resulting in death (CD 5) 1
## 7 Yes, return to operating theatre (CD 3) 1
# Collapse to _yn version
%>%
mydata mutate(across(where(is.yn), fct_collapse_yn, .names = "{col}_yn")) %>%
count(ssi.factor_yn)
## # A tibble: 3 × 2
## ssi.factor_yn n
## <fct> <int>
## 1 No 1
## 2 Unknown 1
## 3 Yes 5
3.2 Filtering best practice
3.2.1 From Jamie Farrell
Particularly useful in OpenSAFELY, but should be adopted by us all.
This creates inclusion flags, and then summarises those inclusion flags in a table. This allows us to see exactly how many rows are removed at each filter step.
The inclusion flags are then used to run the final filter.
library(finalfit)
library(tidyverse)
# Utility function ----
## Ignore, just needed for the example.
<- function(...) {
fct_case_when # uses dplyr::case_when but converts the output to a factor,
# with factors ordered as they appear in the case_when's ... argument
<- as.list(match.call())
args <- sapply(args[-1], function(f) f[[3]]) # extract RHS of formula
levels <- levels[!is.na(levels)]
levels factor(dplyr::case_when(...), levels=levels)
}
# Load data
= colon_s
colon_s
# Create inclusion flags:
# Age over 50, male, obstruction data not missing
= colon_s %>%
colon_s mutate(
is_age_abover_50 = age > 50,
is_male = sex.factor == "Male",
obstruction_not_missing = !is.na(obstruct.factor)
)
# Create exclusion table
= colon_s %>%
flowchart transmute(
c0 = TRUE,
c1 = c0 & is_age_abover_50,
c2 = c1 & is_male,
c3 = c2 & obstruction_not_missing,
%>%
) summarise(
across(.fns=sum)
%>%
) mutate(pivot_col = NA) %>%
pivot_longer(
cols=-pivot_col,
names_to="criteria",
values_to="n"
%>%
) select(-pivot_col) %>%
mutate(
n_exclude = lag(n) - n,
pct_all = (n/first(n)) %>% scales::percent(0.1),
pct_exclude_step = (n_exclude/lag(n)) %>% scales::percent(0.1),
crit = str_extract(criteria, "^c\\d+"),
criteria = fct_case_when(
== "c0" ~ "Original dataset",
crit == "c1" ~ " with age over 50 years",
crit == "c2" ~ " is male",
crit == "c3" ~ " obstruction data not missing",
crit TRUE ~ NA_character_
)%>%
) select(criteria, n, n_exclude, pct_all, pct_exclude_step)
flowchart
## # A tibble: 4 × 5
## criteria n n_exclude pct_all pct_exclude_step
## <fct> <int> <int> <chr> <chr>
## 1 "Original dataset" 929 NA 100.0% <NA>
## 2 " with age over 50 years" 732 197 78.8% 21.2%
## 3 " is male" 385 347 41.4% 47.4%
## 4 " obstruction data not missing" 375 10 40.4% 2.6%
Now filter the dataset in one step.
# Filtered dataset
= colon_s %>%
colon_filtered filter(
is_age_abover_50,
is_male,
obstruction_not_missing )
3.3 Filter NA: Dropping rows where all specified variables are NA
I want to keep rows that have a value for important_a
and/or important_b
(so rows 1, 3, 4).
I don’t care whether whatever_c
is empty or not, but I do want to keep it.
library(tidyverse)
= tibble(important_a = c("Value", NA, "Value", NA, NA),
mydata important_b = c(NA, NA, "Value", "Value", NA),
whatever_c = c(NA, "Value", NA, NA, NA))
mydata
## # A tibble: 5 × 3
## important_a important_b whatever_c
## <chr> <chr> <chr>
## 1 Value <NA> <NA>
## 2 <NA> <NA> Value
## 3 Value Value <NA>
## 4 <NA> Value <NA>
## 5 <NA> <NA> <NA>
Functions for missing values that are very useful, but don’t do what I want are:
- This keeps complete cases based on all columns:
%>%
mydata drop_na()
## # A tibble: 0 × 3
## # ℹ 3 variables: important_a <chr>, important_b <chr>, whatever_c <chr>
(Returns 0 as we don’t have rows where all 3 columns have a value).
- This keeps complete cases based on specified columns:
%>%
mydata drop_na(important_a, important_b)
## # A tibble: 1 × 3
## important_a important_b whatever_c
## <chr> <chr> <chr>
## 1 Value Value <NA>
This only keeps the row where both a and b have a value.
- This keeps rows that have a value in any column:
%>%
mydata filter_all(any_vars(! is.na(.)))
## # A tibble: 4 × 3
## important_a important_b whatever_c
## <chr> <chr> <chr>
## 1 Value <NA> <NA>
## 2 <NA> <NA> Value
## 3 Value Value <NA>
## 4 <NA> Value <NA>
The third example is better achieved using the janitor package:
%>%
mydata ::remove_empty() janitor
## # A tibble: 4 × 3
## important_a important_b whatever_c
## <chr> <chr> <chr>
## 1 Value <NA> <NA>
## 2 <NA> <NA> Value
## 3 Value Value <NA>
## 4 <NA> Value <NA>
Now, (3) is pretty close, but still, I’m not interested in row 2 - where both a and b are empty but c has a value (which is why it’s kept).
- Simple solution
A quick solution is to use ! is.na()
for each variable inside a filter()
:
%>%
mydata filter(! is.na(important_a) | ! is.na(important_b))
## # A tibble: 3 × 3
## important_a important_b whatever_c
## <chr> <chr> <chr>
## 1 Value <NA> <NA>
## 2 Value Value <NA>
## 3 <NA> Value <NA>
And this is definitely what I do when I only have a couple of these variables. But if you have tens, then the filtering logic becomes horrendously long and it’s easy to miss one out/make a mistake.
- Powerful solution:
A scalable solution is to use filter_at()
with vars()
with a select helper (e.g., starts with()
), and then the any_vars(! is.na(.))
that was introduced in (3).
%>%
mydata filter_at(vars(starts_with("important_")), any_vars(! is.na(.)))
## # A tibble: 3 × 3
## important_a important_b whatever_c
## <chr> <chr> <chr>
## 1 Value <NA> <NA>
## 2 Value Value <NA>
## 3 <NA> Value <NA>
3.4 Vectorising rowwise procedures
We frequently have to aggregate across columns. dplyr::rowwise()
is the group_by()
equivalent for rows. But this is painfully slow for large datasets.
The following works beautifully and allows tidyselect of variable names.
library(tidyverse)
= tibble(a = c(1,2,NA),
mydata b = c(2,3,NA),
c = c(3,4,NA),
d = c(NA,NA,NA))
%>%
mydata mutate(s = pmap_dbl(select(., a:d), pmax, na.rm=TRUE))
## # A tibble: 3 × 5
## a b c d s
## <dbl> <dbl> <dbl> <lgl> <dbl>
## 1 1 2 3 NA 3
## 2 2 3 4 NA 4
## 3 NA NA NA NA NA
3.5 Multiple imputation and IPW for missing data
Two approaches are commonly used for dealing with missing data.
Multiple imputation (MI) “fills in” missing values. Inverse probability weighting (IPW) “weights” observed values to reflect characteristics of missing data.
Missing data is discussed in detail here:https://finalfit.org/articles/missing.html
The IPW methods are taken from here: https://www.hsph.harvard.edu/miguel-hernan/causal-inference-book/
3.5.1 Data
library(finalfit)
library(tidyverse)
library(mice)
## Inspect dataset
# colon_s %>% ff_glimpse()
## Below we will use these variables to model,
## so remove missings from them and from outcome for this demonstration
= c("age",
explanatory "sex.factor",
"obstruct.factor",
"perfor.factor",
"adhere.factor",
"nodes",
"differ.factor",
"rx")
## Remove existing missings
= colon_s %>%
colon_s select(mort_5yr, explanatory) %>%
drop_na()
Here is the truth for mortality at 5 years in these data.
## Truth
%>%
colon_s summary_factorlist(explanatory = "mort_5yr")
## label levels all
## Mortality 5 year Alive 476 (55.7)
## Died 378 (44.3)
Let’s conditionally delete some outcomes.
set.seed(1234)
= colon_s %>%
colon_rs mutate(mort_5yr = ifelse(mort_5yr == "Alive",
sample(c("Alive", NA_character_),
size = sum(colon_s$mort_5yr == "Alive"),
replace = TRUE, prob = c(0.9, 0.1)),
sample(c("Died", NA_character_),
size = sum(colon_s$mort_5yr == "Died"),
replace = TRUE, prob = c(0.85, 0.15))) %>%
factor()
)
The proportion that died, is now lower due to our artifically introduced reporting bias:
%>%
colon_rs summary_factorlist(explanatory = "mort_5yr")
## label levels all
## mort_5yr Alive 423 (57.2)
## Died 316 (42.8)
3.5.2 Multiple imputation
First, let’s use multiple imputation to “fill in” missing outcome using the existing information we have about the patient.
## Choose which variables in dataset to include in imputation.
= colon_rs %>%
predM select(mort_5yr, explanatory) %>%
missing_predictorMatrix(
drop_from_imputed = c(""),
drop_from_imputer = c(""))
# This is needed to drop from imputed
# Include for completeness, but not used here.
= mice(colon_rs %>%
m0 select(mort_5yr, explanatory), maxit=0)
# m0$method[c("study_id")] = ""
# m0$method[c("redcap_data_access_group")] = ""
= colon_rs %>%
colon_rs_imputed select(mort_5yr, explanatory) %>%
mice(m = 10, maxit = 10, predictorMatrix = predM, method = m0$method,
print = FALSE) # print false just for example
## Check output here:
# summary(colon_rs_imputed)
# plot(colon_rs_imputed )
3.5.2.1 Reduce imputed sets (mean)
As can be seen, the imputed proportion approaches that of the “true” value.
%>%
colon_rs_imputed complete("all") %>%
map(~ (.) %>% count(mort_5yr) %>%
mutate(nn = sum(n),
prop = n/nn) %>%
select(-mort_5yr)) %>%
reduce(`+`) / 10
## n nn prop
## 1 484.8 854 0.5676815
## 2 369.2 854 0.4323185
3.5.3 IPW
Rather than filling in the missing values, we can weight the observed values to reflect the characteristics of the missing data.
## Define observed vs non-observed groups
= colon_rs %>%
colon_rs mutate(
group_fu = if_else(is.na(mort_5yr), 0, 1)
)
%>%
colon_rs count(group_fu)
## group_fu n
## 1 0 115
## 2 1 739
This method uses “stabilised weights”, meaning weights should have a mean of 1.
# Numerator model
= colon_rs %>%
fit_num glmmulti("group_fu", 1)
# Denominator model
= colon_rs %>%
fit_dem glmmulti("group_fu", explanatory)
# Check denominator model
%>%
fit_dem fit2df()
## Waiting for profiling to be done...
## explanatory OR
## 1 age 1.00 (0.99-1.02, p=0.766)
## 2 sex.factorMale 0.93 (0.62-1.38, p=0.710)
## 3 obstruct.factorYes 1.07 (0.65-1.83, p=0.803)
## 4 perfor.factorYes 4.71 (0.97-85.05, p=0.133)
## 5 adhere.factorYes 0.72 (0.43-1.26, p=0.235)
## 6 nodes 0.96 (0.91-1.01, p=0.083)
## 7 differ.factorModerate 1.61 (0.85-2.91, p=0.128)
## 8 differ.factorPoor 1.49 (0.69-3.15, p=0.301)
## 9 rxLev 1.01 (0.63-1.63, p=0.961)
## 10 rxLev+5FU 1.11 (0.68-1.82, p=0.687)
= colon_rs %>%
colon_rs mutate(
p = predict(fit_num, type = "response"),
pi = predict(fit_dem, type = "response"),
weights = case_when(
== 1 ~ p / pi,
group_fu == 0 ~ (1-p) / (1-pi)
group_fu
)
)
mean(colon_rs$weights)
## [1] 0.9987887
# Whoop whoop if ~1.0
A weighted proportion can then be generated, which gives a very similar result to the multiple imputation.
%>%
colon_rs mutate(
mort_5yr_wgt = (as.numeric(mort_5yr) - 1) * weights
%>%
) summary_factorlist(explanatory = c(" mort_5yr", " mort_5yr_wgt"), digits = c(4, 4, 3, 1, 1) )
## label levels all
## mort_5yr Alive 423.0 (57.2)
## Died 316.0 (42.8)
## mort_5yr_wgt Mean (SD) 0.4317 (0.5008)