第19章 使用ggplot2进行高级绘图
# Goal: visualise multivariate data using shape, colour and size.
# Main plotting packages: ggplot2, plus gridExtra (or cowplot) for layouts.
# Main datasets: singer (lattice), mtcars, and Salaries (car / carData).

# Take a first look at each dataset.
data(singer, package = "lattice")
head(singer, 5)
str(singer)

head(mtcars, 5)
str(mtcars)

# Name the providing package explicitly: a bare data(Salaries) only searches
# attached packages, and no library(car) call precedes this line.
# Salaries ships in carData, the data package that car depends on.
data(Salaries, package = "carData")
head(Salaries, 5)
str(Salaries)
19.1 R中的四种图形系统
四种图形系统:基础图形系统(R自带)、grid图形系统(grid包)、lattice图形系统(lattice包)、ggplot2图形系统(ggplot2包)
grid图形系统(grid包):
用户在图像设备上随意创建矩形区域并定义坐标系统,然后使用绘图基础单元来控制图形元素的摆放与外观。对图形的灵活性很高,适用于软件开发者,但不提供生成统计图形以及完整绘图的函数,不适合数据分析师。
lattice图形系统(lattice包):
绘制网格图形,其显示一个变量的分布或者变量之间的关系,分别显示一个或多个变量的各个水平。基于grid包创建,处理多元数据的可视化功能很强。(一般为绘图的备选方案)
ggplot2图形系统(ggplot2包):
最常用的图形系统,一种全面的,基于语法的,连贯一致的图形生成系统。
19.2 ggplot2 包介绍
ggplot2语法:每一个函数只修改图形中属于自己的那一部分,函数与函数之间用加号(+)串联
library(ggplot2)
# Basic scatter plot of fuel efficiency (mpg) against car weight (wt).
# wt is mapped to x and mpg to y, so "Miles per gallon" belongs on the y axis
# and the x axis is labelled "Weight".
ggplot(data = mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  labs(title = "Automobile data", x = "Weight", y = "Miles per gallon")
# Set the point symbol, colour and size; add a fitted line and set its colour
# and line type.
# Every component must be joined with `+`: without it after geom_smooth(),
# labs() is evaluated as a separate statement and never reaches the plot.
ggplot(data = mtcars, aes(x = wt, y = mpg)) +
  geom_point(pch = 17, color = "forestgreen", size = 2) +
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  labs(title = "Automobile data", x = "Weight", y = "Miles per gallon")
# Faceting example: prepare mtcars first.
# am, vs and cyl are recoded as factors so they can be used as grouping and
# faceting variables.
mtcars[["am"]] <- factor(
  mtcars[["am"]],
  levels = c(0, 1),
  labels = c("auto", "manual")
)
mtcars[["vs"]] <- factor(
  mtcars[["vs"]],
  levels = c(0, 1),
  labels = c("v-engine", "s-engine")
)
mtcars[["cyl"]] <- factor(mtcars[["cyl"]])
# Draw the plot: cyl is the grouping variable (shape and colour);
# am and vs are the faceting variables (am in rows, vs in columns).
ggplot(mtcars, aes(hp, mpg, color = cyl, shape = cyl)) +
  labs(
    title = "Automobile data by engine type",
    x = "horsepower",
    y = "Miles per gallon"
  ) +
  facet_grid(am ~ vs) +
  geom_point(size = 3)
19.3 用几何函数指定图的类型
常用几何函数
几何函数的常见选项
绘图符号和线条类型
# Notched box plot of salary by academic rank, overlaid with jittered raw
# observations and a rug on the left-hand axis.
# The labels describe the Salaries data shown here (rank on x, salary on y).
ggplot(data = Salaries, aes(x = rank, y = salary)) +
  geom_boxplot(fill = "skyblue", color = "black", notch = TRUE) +
  geom_point(position = "jitter", color = "blue", alpha = 0.5, size = 1) +
  geom_rug(sides = "l", color = "grey40") +
  labs(title = "Faculty salary by rank", x = "Rank", y = "Salary")
# Violin plot of singer height by voice part with a narrow box plot on top.
# The `+` after geom_boxplot() is required so that labs() is applied to the
# plot instead of being evaluated as a separate, discarded statement.
ggplot(data = singer, aes(x = voice.part, y = height)) +
  geom_violin(fill = "lightblue") +
  geom_boxplot(fill = "lightgreen", width = 0.2) +
  labs(title = "violin combined with boxplot", x = "Voice.part", y = "height")
19.4 分组
在一个图中画出两个或多组的观察值,在R中一般通过分类变量的水平(因子)来定义
# Overlapping salary densities, one semi-transparent curve per rank.
ggplot(
  data = Salaries,
  mapping = aes(x = salary, fill = rank)
) +
  geom_density(alpha = 0.3)

# Salary against years since PhD; rank sets the colour, sex sets the symbol.
ggplot(
  data = Salaries,
  mapping = aes(x = yrs.since.phd, y = salary, color = rank, shape = sex)
) +
  geom_point()
# Number of professors by academic rank and sex, drawn with the three bar
# position adjustments so they can be compared side by side.
# Each assignment is its own statement; two statements cannot share a line
# without a separator.
library(cowplot)

p1 <- ggplot(Salaries, aes(x = rank, fill = sex)) +
  geom_bar(position = "stack") +
  labs(title = 'position="stack"')

p2 <- ggplot(Salaries, aes(x = rank, fill = sex)) +
  geom_bar(position = "dodge") +
  labs(title = 'position="dodge"')

# With position = "fill" the bars are rescaled, so y is labelled as a proportion.
p3 <- ggplot(Salaries, aes(x = rank, fill = sex)) +
  geom_bar(position = "fill") +
  labs(title = 'position="fill"', y = "proportion")

plot_grid(p1, p2, p3, ncol = 3)
# Three ways of specifying fill, shown side by side:
#   p1 maps the variable sex to fill inside aes();
#   p2 sets the constant "red" on the geom, outside aes();
#   p3 puts the constant "red" inside aes(), where it is treated as a data
#      value rather than as a colour name.
p1 <- ggplot(Salaries, aes(x = rank, fill = sex)) +
  geom_bar()
p2 <- ggplot(Salaries, aes(x = rank)) +
  geom_bar(fill = "red")
p3 <- ggplot(Salaries, aes(x = rank, fill = "red")) +
  geom_bar()
plot_grid(p1, p2, p3, ncol = 3)
# Rule of thumb: map variables inside aes(); set constants outside aes().
19.5 刻面
ggplot2的刻面图函数
分别对以上语法进行实践,可以对比一下几张图的差异
# Height histograms per voice part, panels wrapped into four rows.
ggplot(data = singer, mapping = aes(x = height)) +
  facet_wrap(~ voice.part, nrow = 4) +
  geom_histogram(bins = 25, color = "black", fill = "forestgreen")

# Same histograms, panels wrapped into four columns instead.
ggplot(data = singer, mapping = aes(x = height)) +
  facet_wrap(~ voice.part, ncol = 4) +
  geom_histogram(bins = 25, color = "black", fill = "forestgreen")
library(car)
# Salary against years since PhD, coloured and shaped by rank, under three
# facet_grid layouts.

# One column of panels per sex.
ggplot(Salaries,
       aes(x = yrs.since.phd, y = salary, shape = rank, color = rank)) +
  facet_grid(. ~ sex) +
  geom_point(size = 2)

# One row of panels per sex.
ggplot(Salaries,
       aes(x = yrs.since.phd, y = salary, shape = rank, color = rank)) +
  facet_grid(sex ~ .) +
  geom_point(size = 2)

# Rows by discipline, columns by sex.
ggplot(Salaries,
       aes(x = yrs.since.phd, y = salary, shape = rank, color = rank)) +
  facet_grid(discipline ~ sex) +
  geom_point(size = 2)
# Height density per voice part: one row of panels per voice part, with the
# same variable also used for the fill colour.
ggplot(data = singer, mapping = aes(x = height, fill = voice.part)) +
  facet_grid(voice.part ~ .) +
  geom_density()
19.6 添加光滑曲线
# Relationship between years since PhD and salary: a nonparametric smooth
# curve with its 95% confidence band, drawn underneath the raw points.
ggplot(data = Salaries, mapping = aes(x = yrs.since.phd, y = salary)) +
  geom_smooth() +
  geom_point()

# Fit a quadratic polynomial regression separately for each sex.
# NOTE(review): recent ggplot2 releases (3.4.0+) prefer `linewidth` over `size`
# for line thickness - confirm against the installed version before changing.
ggplot(
  data = Salaries,
  mapping = aes(
    x = yrs.since.phd, y = salary,
    color = sex, shape = sex, linetype = sex
  )
) +
  geom_smooth(method = lm, formula = y ~ poly(x, 2), size = 1, se = FALSE) +
  geom_point(alpha = 0.5, size = 0.8)
19.7 修改ggplot2图形的外观
19.7.1 坐标轴
# Box plots of salary by rank and sex with hand-written axis tick labels:
# rank codes are spelled out on x, salaries are shown in thousands on y.
ggplot(data = Salaries, mapping = aes(x = rank, y = salary, fill = sex)) +
  geom_boxplot() +
  labs(title = "Faculty salary by rank and sex") +
  scale_y_continuous(
    breaks = c(50000, 100000, 150000, 200000),
    labels = c("$50K", "$100K", "$150K", "$200K")
  ) +
  scale_x_discrete(
    breaks = c("AsstProf", "AssocProf", "Prof"),
    labels = c("Assistant\nProfessor", "Associate\nProfessor", "Full\nProfessor")
  )
19.7.2 图例
图例是说明如何用颜色、形状、尺寸等视觉特征来表示数据特征的指南。更改图例标题时,需要考虑图例是基于颜色、形状还是尺寸等哪一种图形属性:如下图中的图例代表性别,由fill属性(fill=sex)生成,因此可以在labs()函数中加入fill="mytitle"来改变图例标题;
图例位置通过theme()函数中的legend.position参数进行修改,可选值为"left"、"right"(默认)、"top"、"bottom"、"none"(删除图例),也可以用向量c(x,y)指定位置,其中x和y的范围均为0~1,x表示到左侧边缘的相对距离,y表示到底部边缘的相对距离。
# Same box plot as in the axis example, now with axis titles, a legend title
# ("Gender", set through the fill aesthetic) and the legend placed inside the
# panel near the top-left corner.
# NOTE(review): newer ggplot2 releases (3.5.0+) reportedly deprecate a numeric
# legend.position in favour of legend.position.inside - confirm before
# upgrading.
ggplot(data = Salaries, mapping = aes(x = rank, y = salary, fill = sex)) +
  geom_boxplot() +
  labs(
    title = "Faculty salary by rank and sex",
    x = "Rank",
    y = "Salary",
    fill = "Gender"
  ) +
  scale_y_continuous(
    breaks = c(50000, 100000, 150000, 200000),
    labels = c("$50K", "$100K", "$150K", "$200K")
  ) +
  scale_x_discrete(
    breaks = c("AsstProf", "AssocProf", "Prof"),
    labels = c("Assistant\nProfessor", "Associate\nProfessor", "Full\nProfessor")
  ) +
  theme(legend.position = c(0.15, 0.85))
19.7.3 标尺
# Bubble chart: point area encodes engine displacement.
ggplot(mtcars, aes(x = wt, y = mpg, size = disp)) +
  geom_point(shape = 21, color = "black", fill = "pink") +
  labs(title = "Bubble chart", x = "Weight", y = "Miles per gallon",
       size = "Engine \ndisplacement")

# Same chart with displacement also mapped to a yellow-to-green fill gradient.
# Both the size legend and the fill legend are titled, so neither falls back
# to the raw column name "disp".
ggplot(mtcars, aes(wt, mpg, fill = disp, size = disp)) +
  geom_point(shape = 21, color = "black") +
  labs(title = "Bubble chart", x = "Weight", y = "Miles per gallon",
       size = "Engine \ndisplacement",
       fill = "Engine \ndisplacement") +
  scale_fill_continuous(low = "yellow", high = "green")
离散型标尺将带有因子水平的视觉线索(如颜色,形状,尺寸,线条类型和透明度)关联起来
scale_color_manual(values)指定颜色
scale_color_brewer(palette = "name") 使用预先设定好的颜色集
display.brewer.all() 展示所有的内置颜色集,即上面的name可取的值
# Salary against years since PhD with a hand-picked colour for each rank.
ggplot(
  data = Salaries,
  mapping = aes(x = yrs.since.phd, y = salary, color = rank)
) +
  geom_point(size = 2) +
  scale_color_manual(values = c("orange", "olivedrab", "navy"))
# Compare ColorBrewer palettes side by side: the same scatter plot is drawn
# once per palette name, with the palette name as the panel title.
# Building the plots in a loop replaces the elided p2..p8 definitions, so all
# eight panels passed to plot_grid() actually exist.
brewer_palettes <- c("Set1", "Set2", "Set3", "Pastel1",
                     "Pastel2", "Paired", "Dark2", "Accent")

palette_plots <- lapply(brewer_palettes, function(pal) {
  ggplot(Salaries, aes(x = yrs.since.phd, y = salary, color = rank)) +
    scale_color_brewer(palette = pal) +
    geom_point(size = 2) +
    labs(title = pal)
})

plot_grid(plotlist = palette_plots, ncol = 4)
# Colour utilities
library(RColorBrewer)
# Show every palette that RColorBrewer provides.
display.brewer.all()
R内置颜色集
# brewer.pal(n, name): n is the number of colours (3 is the minimum; the
# maximum depends on the palette), name is one of the built-in palette names.
display.brewer.pal(n = 5, name = "Accent")
brewer.pal(n = 5, name = "Accent")
# Result: "#7FC97F" "#BEAED4" "#FDC086" "#FFFF99" "#386CB0"

# A hex code from the palette can be used directly as a constant colour.
ggplot(Salaries, aes(x = yrs.since.phd, y = salary)) +
  geom_point(color = "#386CB0")

# Interpolate four colours between red and blue.
colorRampPalette(c("red", "blue"))(4)
# Result: "#FF0000" "#AA0055" "#5500AA" "#0000FF"
还可以参见 RColorBrewer与ggplot2 - 简书 (jianshu.com)
#可以参见 http://colorbrewer2.org/
19.7.4 主题
#将设置好的统一的主题保存起来,可以应用到多个图中
# Reusable theme: brown bold-italic titles, dark-blue bold axis text, a white
# panel with a dark-blue border, horizontal grid lines only, legend on top.
mytheme <- theme(
  legend.position    = "top",
  panel.background   = element_rect(fill = "white", color = "darkblue"),
  panel.grid.major.y = element_line(color = "grey", linetype = 1),
  panel.grid.minor.y = element_line(color = "grey", linetype = 2),
  panel.grid.minor.x = element_blank(),
  plot.title         = element_text(face = "bold.italic", size = 14,
                                    color = "brown"),
  axis.title         = element_text(face = "bold.italic", size = 10,
                                    color = "brown"),
  axis.text          = element_text(face = "bold", size = 9,
                                    color = "darkblue")
)

# Apply the saved theme to a box plot of salary by rank and sex.
ggplot(data = Salaries, mapping = aes(x = rank, y = salary, fill = sex)) +
  geom_boxplot() +
  labs(title = "Salary by Rank and Sex", x = "Rank", y = "Salary") +
  mytheme
19.7.5 多重图
# Build three independent plots and arrange them in a single row.
# Each statement sits on its own line; two statements cannot share a line
# without a separator.
p1 <- ggplot(Salaries, aes(x = rank)) +
  geom_bar()

p2 <- ggplot(Salaries, aes(x = sex)) +
  geom_bar()

p3 <- ggplot(Salaries, aes(x = yrs.since.phd, y = salary)) +
  geom_point()

plot_grid(p1, p2, p3, ncol = 3)
# Save the plot to disk as a 16 cm x 16 cm PDF.
# The "pdf" extension can be swapped for another format such as png, ps,
# tiff, svg, wmf, jpeg, tex or bmp.
ggsave(
  filename = "mypicture.pdf",
  units = "cm",
  width = 16,
  height = 16,
  dpi = 300
)