# Install dplyr only when it is not already available, so that re-running the
# script does not trigger a fresh download and install every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
# Using the mean.
# Scatter plots are the usual tool for studying the relationship between two variables.
# ggplot2 has to be attached before the first ggplot()/qplot() call below;
# the script otherwise only loads it further down.
library(ggplot2)

# Read the data file into the data frame used by the rest of the script.
pf <- read.delim("pseudo_facebook.tsv")

# Scatter plot of friend_count against age, written with ggplot() ...
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point()

# ... and the same plot through the qplot() shorthand.
qplot(x = age, y = friend_count, data = pf)
# Set alpha = 1/20 so that 20 overlapping points are needed to reach the opacity of one solid point, and restrict the range of x.
# friend_count vs. age with points drawn at alpha 1/20 and x limited to 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90)
# Add jitter noise.
# Same plot as before, but geom_jitter() replaces geom_point().
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)
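# Take the square root of the friend count and add jitter. Jitter only horizontally (h = 0): vertical jitter could push a friend count of 0 below zero, and the square root of a negative number is imaginary.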
# Jittered scatter (jitter height fixed at 0) with a square-root transformed
# y coordinate and x limited to 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(
    position = position_jitter(height = 0),
    alpha = 1 / 20
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
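# dplyr 0.4.0 has a bug when the median function is used inside summarise(), depending on the nature of the data being summarised. When using the package on a local machine you may need to convert the data to numeric (double) first, e.g. median(as.numeric(var)).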
library(dplyr)
library(ggplot2)

# Group the users by age, then compute per-age mean, median and group size.
# The result is ordered by age and stored in pf.fc_by_age.
age_groups <- group_by(pf, age)
pf.fc_by_age <- arrange(
  summarise(
    age_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age
)

# Show the first rows of the summary.
head(pf.fc_by_age)
# Alternatively (in newer versions of dplyr (0.3.x+) the %.% operator
# has been deprecated and replaced by %>%):
# Pipe version of the per-age summary: group by age, compute the mean, median
# and group size of friend_count, then order by age.
# The piped data is already the first argument of summarise(), so nothing
# else is passed before the summary expressions (a stray `age_groups` there
# would be treated as an extra summary expression and fail).
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)

# Show the first rows of the summary.
head(pf.fc_by_age)
head(pf.fc_by_age)
# A tibble: 6 x 4
#     age friend_count_mean friend_count_median     n
#   <int>             <dbl>               <dbl> <int>
# 1    13              165.                 74    484
# 2    14              251.                132   1925
# 3    15              348.                161   2618
# 4    16              352.                172.  3086
# 5    17              350.                156   3283
# 6    18              331.                162   5196
# >
# Mean friend count per age, first as points ...
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_point()

# ... then as a connected line.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()
# Orange, horizontally jittered scatter of friend_count vs. age at alpha 1/10,
# x limited to 13-90, y coordinate square-root transformed.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(
    color = "orange",
    alpha = 1 / 10,
    position = position_jitter(height = 0)
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
# Same orange scatter as above, overlaid with a line of the mean friend_count
# computed by the summary stat.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(
    color = "orange",
    alpha = 1 / 10,
    position = position_jitter(height = 0)
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean)
# Scatter of friend_count vs. age overlaid with the mean line and dashed blue
# lines for the 10%, 50% and 90% quantiles.
# The x range is set with the xlim() layer, as in the earlier plots; writing
# `xlim = c(13, 90)` inside the `+` chain is an assignment, not a layer, and
# makes the whole expression invalid.
# NOTE(review): `fun.y` is kept as written; newer ggplot2 releases prefer
# `fun` - confirm against the installed version.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  xlim(13, 90) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = .1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = .5),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = .9),
    linetype = 2, color = "blue"
  )
# Scatter with mean and 10% / 50% / 90% quantile lines; coord_cartesian()
# restricts the view to ages 13-90 and friend counts 0-1000.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  coord_cartesian(xlim = c(13, 90), ylim = c(0, 1000)) +
  geom_point(
    color = "orange",
    alpha = 0.05,
    position = position_jitter(height = 0)
  ) +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary",
    fun.y = quantile,
    fun.args = list(probs = .1),
    color = "blue",
    linetype = 2
  ) +
  geom_line(
    stat = "summary",
    fun.y = quantile,
    fun.args = list(probs = .5),
    color = "blue",
    linetype = 2
  ) +
  geom_line(
    stat = "summary",
    fun.y = quantile,
    fun.args = list(probs = .9),
    color = "blue",
    linetype = 2
  )
# Compute the correlation coefficient; the following two expressions are equivalent.
# Pearson correlation test between age and friend_count, once with explicit
# column references ...
cor.test(x = pf$age, y = pf$friend_count, method = "pearson")

# ... and once with the columns looked up inside pf via with().
with(
  pf,
  cor.test(x = age, y = friend_count, method = "pearson")
)
# data:  age and friend_count
# t = -8.6268, df = 99001, p-value < 2.2e-16
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
#  -0.03363072 -0.02118189
# sample estimates:
#         cor
# -0.02740737
# Pearson is the default method, so the method argument can be omitted.
# Correlation tests restricted to users aged 70 or younger: Pearson first,
# then Spearman.
with(
  subset(pf, age <= 70),
  cor.test(x = age, y = friend_count, method = "pearson")
)
with(
  subset(pf, age <= 70),
  cor.test(x = age, y = friend_count, method = "spearman")
)

# List the column names of pf.
names(pf)

# Scatter plot of likes_received against www_likes_received.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point()
# Fitting a curve.
# First compute the correlation.
# Correlation test between www_likes_received and likes_received
# (default method).
cor.test(
  x = pf$www_likes_received,
  y = pf$likes_received
)
# The correlation is about 0.9, i.e. a strong correlation, so next fit a curve to the data.
# Scatter of likes_received vs. www_likes_received with both axes limited to
# the range from 0 to the 95% quantile, plus a red linear-model smooth.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = "lm", color = "red")
# Install alr3 only when it is not already available, so that re-running the
# script does not reinstall it every time.
# NOTE(review): alr3 may no longer be available from CRAN - confirm the
# package source before relying on the install step.
if (!requireNamespace("alr3", quietly = TRUE)) {
  install.packages("alr3")
}
library(alr3)

# Load the Mitchell data set and open its help page.
data(Mitchell)
?Mitchell
# The following two plots are equivalent.
# Temp against Month, drawn with ggplot() and again with qplot().
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point()
qplot(x = Month, y = Temp, data = Mitchell)

# Copy the data set and test the correlation between Month and Temp.
data <- Mitchell
cor.test(x = data$Month, y = data$Temp)
# Make the x axis easier to read.
# Temp against Month with an x-axis break every 12 months from 0 up to 203.
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 0, to = 203, by = 12))
# The pattern looks very much like a sine function.
# Temp against Month modulo 12, overlaying all 12-month cycles.
ggplot(data = Mitchell, mapping = aes(x = (Month %% 12), y = Temp)) +
  geom_point()

# Line plot of the mean friend count per age.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()
# The plot above is visibly noisy (the parts that are not smooth).
# Inspect the first ten rows and rows 17-19 of the per-age summary.
head(pf.fc_by_age, n = 10)
pf.fc_by_age[17:19, ]

# Derive age_with_months from age and dob_month. The same formula is written
# twice, once via with() and once with explicit `pf$` references; both
# assignments store the same values.
pf$age_with_months <- with(pf, age + (1 - dob_month / 12))
pf$age_with_months <- pf$age + (1 - pf$dob_month / 12)
library(dplyr)

# Per-age_with_months mean, median and group size of friend_count, ordered by
# age_with_months (pipe form).
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)

head(pf.fc_by_age_months)
# The same summary without the pipe: group first, then summarise and order
# the grouped data.
age_with_months_groups <- group_by(pf, age_with_months)
pf.fc_by_age_months2 <- arrange(
  summarise(
    age_with_months_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age_with_months
)

head(pf.fc_by_age_months2)
# Mean friend count by age in months, for people younger than 71.
# Line of mean friend count against age_with_months, restricted to rows with
# age_with_months below 71.
ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()
# p1: mean friend count by age_with_months (below 71).
p1 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

# p2: mean friend count by age (below 71).
p2 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line()

# Stack both plots in one column, p2 on top.
library(gridExtra)
grid.arrange(p2, p1, ncol = 1)
# p1: mean friend count by age_with_months (below 71).
p1 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

# p2: mean friend count by age (below 71).
p2 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line()

# p3: mean friend count with age rounded to multiples of 5, computed from the
# raw rows of pf by the summary stat.
p3 <- ggplot(
  data = subset(pf, age < 71),
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun.y = mean)

# Stack the three plots in one column.
library(gridExtra)
grid.arrange(p3, p2, p1, ncol = 1)
# p1: mean friend count by age_with_months (below 71) with a smoother added.
p1 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p2: mean friend count by age (below 71) with a smoother added.
p2 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p3: mean friend count with age rounded to multiples of 5 (no smoother).
p3 <- ggplot(
  data = subset(pf, age < 71),
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun.y = mean)

# Stack the three plots in one column.
library(gridExtra)
grid.arrange(p3, p2, p1, ncol = 1)