%load_ext rpy2.ipython
import pandas
# Read listit-notes.csv into a pandas DataFrame and echo it for a quick visual check.
listit_notes = pandas.read_csv('listit-notes.csv')
print(listit_notes)
# Copy the DataFrame into the embedded R session (rpy2) under the same name,
# so the %%R cells can refer to `listit_notes` directly.
%Rpush listit_notes
%%R
# R-side per-column summary of the data frame that was just pushed.
summary(listit_notes)
# Read twitch-unlocks.csv into a pandas DataFrame.
twitch_unlocks = pandas.read_csv('twitch-unlocks.csv')
# Copy the DataFrame into the embedded R session (rpy2) under the same name.
%Rpush twitch_unlocks
# Echo the Python-side DataFrame for a quick visual check.
print(twitch_unlocks)
%%R
# R-side per-column summary of the data frame that was just pushed.
summary(twitch_unlocks)
# Read feedme.csv into a pandas DataFrame.
feedme = pandas.read_csv('feedme.csv')
# Copy the DataFrame into the embedded R session (rpy2) under the same name.
%Rpush feedme
# Echo the Python-side DataFrame for a quick visual check.
print(feedme)
%%R
# R-side per-column summary of the data frame that was just pushed.
summary(feedme)
# Read cs376_addition_tasktime.csv into a pandas DataFrame.
addition_tasktime = pandas.read_csv('cs376_addition_tasktime.csv')
# Copy the DataFrame into the embedded R session (rpy2) under the same name.
%Rpush addition_tasktime
# Echo the Python-side DataFrame for a quick visual check.
print(addition_tasktime)
%%R
# R-side per-column summary of the data frame that was just pushed.
summary(addition_tasktime)
%%R
# Re-encode `format`, `interrupted` and `user_id` as factors so that R treats
# them as categorical variables; the summary is re-printed to show the result.
addition_tasktime[c("format", "interrupted", "user_id")] <- lapply(
  addition_tasktime[c("format", "interrupted", "user_id")],
  as.factor
)
summary(addition_tasktime)
%%R
# Re-encode `preference` as a factor so that R treats it as categorical;
# the summary is re-printed to show the result.
feedme[["preference"]] <- as.factor(feedme[["preference"]])
summary(feedme)
%%R
# Re-encode `phoneID` and `activity` as factors so that R treats them as
# categorical; the summary is re-printed to show the result.
twitch_unlocks[c("phoneID", "activity")] <- lapply(
  twitch_unlocks[c("phoneID", "activity")],
  as.factor
)
summary(twitch_unlocks)
%%R
# listit_notes is summarised again as-is; no columns are converted in this cell.
summary(listit_notes)
%%R
library(ggplot2)
library(dplyr)
%%R
# Bar chart of how many feedme rows fall into each `preference` level.
# The counts are tabulated once with dplyr (one row per level, count in
# `Freq`) and converted to a plain data.frame before plotting.
preference_counts <- feedme %>%
  group_by(preference) %>%
  summarize(Freq = n()) %>%
  as.data.frame()
# stat = "identity": bar heights are the precomputed counts in `Freq`.
ggplot(data = preference_counts, aes(x = preference, y = Freq)) +
  geom_bar(stat = "identity")
%%R
# Chi-squared test on the one-way table of `preference` counts
# (all levels included).
chisq.test(table(feedme$preference))
%%R
# Same test restricted to rows whose preference is not "neither".
fmdfopin <- subset(feedme, preference != "neither")
# Rebuild the factor so the now-empty "neither" level is dropped; otherwise it
# would still appear in the table as a zero-count category.
fmdfopin[["preference"]] <- factor(fmdfopin[["preference"]])
chisq.test(table(fmdfopin$preference))
%%R
# Histogram of the `duration` column, one panel per `activity` level.
ggplot(data = twitch_unlocks, mapping = aes(x = duration)) +
  geom_histogram() +
  facet_wrap(~activity)
%%R
# Same layout for the `transformedDuration` column, which is the variable
# used by the ANOVA cells that follow.
ggplot(data = twitch_unlocks, mapping = aes(x = transformedDuration)) +
  geom_histogram() +
  facet_wrap(~activity)
%%R
# Mean of `transformedDuration` within each `activity` level, rounded to
# three decimal places.
twitch_unlocks %>%
  group_by(activity) %>%
  summarize(round(mean(transformedDuration), 3))
%%R
# One-way ANOVA: does mean `transformedDuration` differ across `activity`
# levels? The fit gets its own descriptive name instead of `anova`, so it does
# not mask stats::anova() and stays available to the Tukey cell below no
# matter what other cells assign.
activity_aov <- aov(transformedDuration ~ activity, data = twitch_unlocks)
summary(activity_aov)
%%R
# Tukey HSD post-hoc test: pairwise comparisons between activity levels for
# the one-way model fitted above.
TukeyHSD(activity_aov)
%%R
# ANOVA of `transformedDuration` on `activity`, `unlockNumberForPhone` and
# their interaction (the `*` operator expands to both main effects plus the
# interaction term).
anova2 <- aov(
  formula = transformedDuration ~ activity * unlockNumberForPhone,
  data = twitch_unlocks
)
summary(anova2)
%%R
# Histogram of note `length` over all of listit_notes (before any outlier
# filtering). Uses ggplot() + geom_histogram(), like the other plots in this
# notebook, rather than the deprecated qplot() shortcut.
ggplot(listit_notes, aes(length)) + geom_histogram()
%%R
# Remove outliers, i.e. notes of length greater than 500.
listitfiltered <- filter(listit_notes, length < 501)
## Bartlett's test of homogeneity of variances.
## H0: the variance of `length` is the same in every
##     `lifetime_under_one_day` group.
## HA: the variances are not all the same.
## A small p-value means data like these would be unlikely if the group
## variances were truly equal.
## The formula must be the first argument, with the data frame passed as
## `data`: if the data frame comes first, bartlett.test() treats each of its
## columns as a separate sample and the grouping formula is never applied.
bartlett.test(length ~ lifetime_under_one_day, data = listitfiltered)
%%R
## Recorded by the original author: p-value < 2.2e-16 (presumably the result
## of the preceding Bartlett test -- TODO confirm against a fresh run).
## Two-sample t-test on note length.
## H0: there is no difference in the length of notes that live less than a
##     day and those that do not.
## HA: there is a difference.
## A small p-value means a difference this large would be unlikely if the
## true mean lengths were the same.
## var.equal = FALSE requests the Welch test, which does not assume equal
## variances in the two groups.
t.test(length ~ lifetime_under_one_day, data = listitfiltered, var.equal = FALSE)
%%R
# Number of observations in each format x interrupted cell of the design.
table(addition_tasktime$format, addition_tasktime$interrupted)
%%R
# Mean and standard deviation of `task_time` in each format x interrupted
# cell, both rounded to two decimal places.
addition_tasktime %>%
  group_by(format, interrupted) %>%
  summarize(
    mean = round(mean(task_time), 2),
    sd = round(sd(task_time), 2)
  )
%%R
# Box plots of `task_time` by `format`, one panel per `interrupted` level.
ggplot(
  data = addition_tasktime,
  mapping = aes(x = factor(format), y = task_time)
) +
  geom_boxplot() +
  facet_wrap(~interrupted)
%%R
# Histograms of `task_time` laid out as a grid: rows are `format` levels,
# columns are `interrupted` levels.
ggplot(data = addition_tasktime, mapping = aes(x = task_time)) +
  geom_histogram() +
  facet_grid(format ~ interrupted)
%%R
## The plots show some outliers in the micro/no cell. Because they all sit in
## a single cell, they should be inspected individually to see whether
## something interesting is happening there.
## List every observation with task_time above 149.
addition_tasktime %>%
  filter(task_time > 149)
%%R
## Two-way repeated-measures ANOVA of task_time on format and interrupted.
## Columns are named in the formula and resolved through `data =` rather than
## being passed as `addition_tasktime$...` vectors, which keeps the formula
## readable and the strata labels in the output short. The fit is not called
## `anova`, so it does not mask stats::anova().
task_time_rm_aov <- aov(
  task_time ~ format * interrupted +           # this is the basic anova
    Error(user_id / (format * interrupted)),   # this is the repeated measures
  data = addition_tasktime
)
summary(task_time_rm_aov)
%%R
## How to interpret the repeated-measures ANOVA output.
## (In the output pasted below the data frame is called `mmtask`; presumably
## that is the data this notebook calls `addition_tasktime` -- confirm.)
##
## This stratum shows that there is a main effect of task format:
## Error: mmtask$user_id:mmtask$format
## Df Sum Sq Mean Sq F value Pr(>F)
## mmtask$format 1 78841 78841 62.28 2.81e-09 ***
## Residuals 35 44308 1266
##
## This stratum shows that being interrupted on its own is not significant:
## Error: mmtask$user_id:mmtask$interrupted
## Df Sum Sq Mean Sq F value Pr(>F)
## mmtask$interrupted 1 1659 1659.3 2.691 0.11
## Residuals 35 21583 616.7
##
## This stratum shows that there is an interaction effect between the format
## and being interrupted:
## Error: mmtask$user_id:mmtask$format:mmtask$interrupted
## Df Sum Sq Mean Sq F value Pr(>F)
## mmtask$format:mmtask$interrupted 1 2223 2222.9 4.624 0.0385 *
## Residuals 35 16824 480.7
##
## But maybe those outliers are driving the interaction effect, so every
## participant with an outlying observation (task_time > 149) is removed from
## the analysis -- all of their rows, not just the outlying ones.
temp <- addition_tasktime %>%
  filter(task_time > 149) %>%
  select(user_id)
mmtaskno <- addition_tasktime %>%
  filter(!(user_id %in% temp$user_id))
## Cell counts of the design after removing those participants.
table(mmtaskno$format, mmtaskno$interrupted)
%%R
# Histograms of `task_time` for the outlier-free data (mmtaskno): rows are
# `format` levels, columns are `interrupted` levels.
ggplot(data = mmtaskno, mapping = aes(x = task_time)) +
  geom_histogram() +
  facet_grid(format ~ interrupted)
%%R
## Two-way repeated-measures ANOVA of task_time on format and interrupted,
## re-fitted on mmtaskno (the data with the outlying participants removed).
## Columns are named in the formula and resolved through `data =` rather than
## being passed as `mmtaskno$...` vectors, which keeps the formula readable
## and the strata labels in the output short.
anova2 <- aov(
  task_time ~ format * interrupted +           # this is the basic anova
    Error(user_id / (format * interrupted)),   # this is the repeated measures
  data = mmtaskno
)
summary(anova2)
## Author's recorded interpretation: this makes more sense. Once the outliers
## are removed, there is a main effect of task format and of being
## interrupted, and the interaction of the two is marginal. This is why it's
## important to check for outliers first!