1

I have a data frame with three columns (`date`, the outcome of interest `v169`, and the group indicator `treated`), like this:

Dataframe <- data.frame(
  date = structure(c(12482, 12499, 12478, 12484, 12477, 12492, 12475, 12490, 12490, 12482, 12488, 12474, 12487, 12474, 12473, 12473, 12478, 12474, 12481, 12474, 12489, 12485, 12479, 12479, 12479, 12479, 12481, 12477, 12474, 12481, 12481, 12478, 12478, 12473, 12479, 12479, 12478, 12482, 12480, 12478, 12480, 12479, 12475, 12481, 12480, 12477, 12477, 12477, 12476, 12476, 12474, 12473, 12474, 12483, 12472, 12479, 12481, 12488, 12481, 12482, 12481, 12482, 12488, 12478, 12474, 12481, 12481, 12480, 12478, 12479, 12475, 12476, 12478, 12482, 12479, 12478, 12478, 12477, 12479, 12479, 12479, 12479, 12478, 12480, 12478, 12487, 12482, 12475, 12475, 12474, 12474, 12478, 12473, 12485, 12482, 12473, 12474, 12472, 12478, 12478, 12479, 12479, 12488, 12476, 12492, 12493, 12479, 12482, 12480, 12476, 12476, 12482, 12479, 12475, 12472, 12475, 12475, 12475, 12482, 12482, 12482, 12478, 12480, 12485, 12480, 12482, 12481, 12480, 12480, 12480, 12480, 12478, 12481, 12478, 12478, 12479, 12481, 12481, 12482, 12482, 12482, 12479, 12478, 12476, 12483, 12475, 12477, 12477, 12480, 12485, 12485, 12479, 12476, 12480, 12476, 12481, 12485, 12479, 12480, 12484, 12479, 12481, 12487, 12490, 12486, 12482, 12480, 12494, 12493, 12485, 12479, 12477, 12477, 12481, 12481, 12483, 12480, 12479, 12483, 12472, 12474, 12471, 12482, 12479, 12489, 12480, 12494, 12481, 12483, 12483, 12488, 12471, 12476, 12482), class = "Date"),
  v169 = c(1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0),
  treated = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))

I want to plot the difference between treated == 1 and treated == 0 in the proportion of outcome v169 (as outlined here: Plot difference between proportions for subgroups) using this code:

# Attempt to plot, per date, the difference between the treated and untreated
# groups in the proportion of rows with v169 == 1, as a line with points.
Dataframe %>%
  mutate(
    treated2 = factor(treated, levels = c("0", "1")),
    date = as.POSIXct(date)) %>% # convert date to POSIXct
  group_by(treated2, date) %>% # one group per treatment level and date
  summarise(
    prop = sum(v169 == "1") / n()) %>% # proportion of rows with v169 == 1 in each group
  # Reshape so that each level of treated2 becomes its own column of prop values.
  spread(treated2, prop) %>%
  # NOTE: `1 - 0` subtracts two numeric literals, so propdiff is 1 on every row;
  # the columns created by spread() are not referenced here.
  mutate(propdiff = 1 - 0) %>% #I tried using "1" and "0" here but then get the error "Error in mutate_impl(.data, dots) : Evaluation error: non-numeric argument to binary operator."
  ggplot(aes(date, propdiff)) +
  geom_line() + 
  geom_point()

Unfortunately, the solution doesn't work with this data set: the difference is always 1.00 (if propdiff = 1 - 0) or -1.00 (if propdiff = 0 - 1), i.e. it seems one of the two data lines is not taken into consideration.

I can't figure out why - the code worked with the sample data set I provided in the other question, but there seems to be a particularity of this data that I can't pin down. I'm sure it's not something that would cause an experienced programmer much of a headache, but I can't figure it out. Could someone point me in the right direction?

markus
  • 25,843
  • 5
  • 39
  • 58
Ivo
  • 3,890
  • 5
  • 22
  • 53

1 Answer

1

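Try using backticks, i.e. `1` and `0`, in your mutate call. Without backticks, `1 - 0` is plain arithmetic on two numeric literals (always 1), and with quotes R tries to subtract two character strings, which causes the "non-numeric argument to binary operator" error. Backticks make R treat `1` and `0` as the column names created by spread().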

library(tidyverse)
# Plot, per date, the difference in the proportion of v169 == 1 between the
# treated (treated == 1) and untreated (treated == 0) groups.
Dataframe %>%
  mutate(
    treated2 = factor(treated, levels = c("0", "1")),
    date = as.POSIXct(date)
  ) %>%
  group_by(treated2, date) %>%
  # mean() of a logical vector is the proportion of TRUE values.
  # .groups = "drop" returns an ungrouped tibble, so the reshape below is not
  # affected by leftover grouping on treated2.
  summarise(prop = mean(v169 == 1), .groups = "drop") %>%
  # One column per treatment level, named `0` and `1`; a date that occurs in
  # only one group gets NA in the other column.
  pivot_wider(names_from = treated2, values_from = prop) %>%
  # Backticks are required: bare 1 and 0 are numeric literals, not columns.
  mutate(propdiff = `1` - `0`) %>%
  ggplot(aes(date, propdiff)) +
  geom_line() +
  geom_point()

warning: 1: Removed 2 rows containing missing values (geom_path). 2: Removed 7 rows containing missing values (geom_point).

enter image description here


One approach to connecting the points with geom_line in the presence of missing values is taken from this answer: https://stackoverflow.com/a/9641380/8583393

# Per-date difference in the proportion of v169 == 1 between the treated (`1`)
# and untreated (`0`) groups. Dates that occur in only one group end up with
# propdiff = NA.
df <- Dataframe %>%
  mutate(
    treated2 = factor(treated, levels = c("0", "1")),
    date = as.POSIXct(date)
  ) %>%
  group_by(treated2, date) %>%
  # mean() of a logical vector is the proportion of TRUE values.
  # .groups = "drop" returns an ungrouped tibble before reshaping.
  summarise(prop = mean(v169 == 1), .groups = "drop") %>%
  # One column per treatment level, named `0` and `1`.
  pivot_wider(names_from = treated2, values_from = prop) %>%
  # pivot_wider() keeps rows in order of first appearance; sort by date so the
  # stored table is in chronological order.
  arrange(date) %>%
  # Backticks are required: bare 1 and 0 are numeric literals, not columns.
  mutate(propdiff = `1` - `0`)

# The line layer gets only the rows with a non-missing difference, so the line
# is drawn straight across dates where propdiff is NA instead of breaking.
# The point layer still uses the full data.
ggplot(df, aes(date, propdiff)) +
  geom_line(data = filter(df, !is.na(propdiff))) +
  geom_point()

enter image description here

markus
  • 25,843
  • 5
  • 39
  • 58
  • This was **exactly** what I meant - the easy thing I was blind to. Thanks a ton! Is there an easy way to smooth the line and force the connection of dots that are left out? – Ivo Apr 09 '18 at 11:41
  • @Ivo glad I could help. Edited the answer. – markus Apr 09 '18 at 12:25