1.因子

r语言 因子 r语言中的因子型变量_取值


变量可分为:名义型变量、有序型变量(取值之间有顺序,但不是连续的)、连续型变量

因子用于表示名义型变量和有序型变量。这类变量可能的取值称为水平(level),由水平值构成的向量称为因子。

因子可以简单看作一个附加了更多信息的向量。因子最基本的特征是“水平”。

> y<-c(2,5,8,12)
> yf<-factor(y)
> yf
[1] 2  5  8  12
Levels: 2 5 8 12
> str(yf)  ###查看yf的内部结构
 Factor w/ 4 levels "2","5","8","12": 1 2 3 4
> unclass(yf) ###去掉因子的类属性,查看底层的整数编码和水平
[1] 1 2 3 4
attr(,"levels")
[1] "2"  "5"  "8"  "12"

创建因子

# Demo of attach()/detach(): attach() puts the columns of mtcars on the search
# path so they can be referenced without `mtcars$`; detach() removes them again.
attach(mtcars)
detach(mtcars)

# with() evaluates an expression inside the data frame without attaching it;
# here it returns all values of the hp column.
with(mtcars, hp)

# Creating factors ----
# Ordered factor; `levels` fixes the order red < yellow < blue.
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
f <- factor(
  c("red", "yellow", "blue"),
  levels = c("red", "yellow", "blue"),
  ordered = TRUE
)
# Turn the numeric cyl column of mtcars into a factor.
fcyl <- factor(mtcars$cyl)

plot(mtcars$cyl)  # numeric vector: plots each value against its index
plot(fcyl)        # factor: bar chart of the frequency of each level

因子的常用函数

tapply()

r语言 因子 r语言中的因子型变量_Time_02


tapply()函数的用法:

tapply(x,f,g) 向量x、因子或因子列表f、函数g

###将x分组,每组对应一个因子水平(f为因子列表时,对应一组因子水平的组合),得到x的子向量,然后对每个子向量应用函数g。

> ages<-c(34,25,28,37,43,35)
> affils<-c("R","D","D","R","U","D")
> tapply(ages,affils,mean)
       D        R        U 
29.33333 35.50000 43.00000

例子:返回以下几个子组的平均收入

r语言 因子 r语言中的因子型变量_r语言 因子_03

> d<-data.frame(list(sex=c("M","M","F","M","F","F"),age=c(47,59,21,32,33,34),income=c(55000,88000,32450,76500,123000,45650)))
> d
  sex age income
1   M  47  55000
2   M  59  88000
3   F  21  32450
4   M  32  76500
5   F  33 123000
6   F  34  45650
> d$over25<-ifelse(d$age>25,1,0) ###按照年龄分组
> d
  sex age income over25
1   M  47  55000      1
2   M  59  88000      1
3   F  21  32450      0
4   M  32  76500      1
5   F  33 123000      1
6   F  34  45650      1
> tapply(d$income,list(d$over25,d$sex),mean)
      F        M
0 32450       NA
1 84325 73166.67

split()

r语言 因子 r语言中的因子型变量_r语言 因子_04


r语言 因子 r语言中的因子型变量_r语言 因子_05


输出的是列表

> split(d$income,list(d$over25,d$sex))
$`0.F`
[1] 32450

$`1.F`
[1] 123000  45650

$`0.M`
numeric(0)

$`1.M`
[1] 55000 88000 76500

by()

r语言 因子 r语言中的因子型变量_取值_06

# Regress abalone diameter on length for one group.
# m: the subset of rows that by() passes in for a single level of the grouping
#    factor; it must have Diameter and Length columns.
# Returns the fitted lm object for that subset.
# The fit has to use `m`: a body that refers to abamf$Diameter ~ abamf$Length
# ignores the subset and refits the full data for every group, which is why the
# transcript output recorded after the by() call shows identical coefficients
# for Sex 0 and Sex 1. NOTE(review): re-run by() to refresh that output.
f <- function(m) {
  lm(Diameter ~ Length, data = m)
}
> by(abamf,abamf$Sex,f) ###按性别做鲍鱼直径对长度的回归
abamf$Sex: 0

Call:
lm(formula = abamf$Diameter ~ abamf$Length)

Coefficients:
 (Intercept)  abamf$Length  
    -0.01197       0.80480  

------------------------------------------------------- 
abamf$Sex: 1

Call:
lm(formula = abamf$Diameter ~ abamf$Length)

Coefficients:
 (Intercept)  abamf$Length  
    -0.01197       0.80480

2.表

> u<-c(22,8,33,6,8,29,-2)
> f1<-c("a","bc","a","a","bc","a","a")
> tapply(u,f1,length)  ####求长度,与u的元素具体取值无关
 a bc 
 5  2 
 
> u<-c(22,8,33,6,8,29,-2)
> f1<-list(c(5,12,13,12,13,5,13),c("a","bc","a","a","bc","a","a"))
> f1
[[1]]
[1]  5 12 13 12 13  5 13

[[2]]
[1] "a"  "bc" "a"  "a"  "bc" "a"  "a" 

> tapply(u,f1,length) ##length与u的具体取值无关,这里按f1中两个因子(5/12/13与a/bc)的水平组合分组,统计每个组合中元素的个数;没有出现过的组合(如5与bc)结果为NA
   a bc
5  2 NA
12 1  1
13 2  1

上述结果其实是列联表。
因为5和bc的组合从未出现,对应位置的频数应该是0而不是NA,所以改用table()函数来建立列联表。table()的第一个参数是因子或者因子的列表。
table()函数计算频数

> f1<-list(c(5,12,13,12,13,5,13),c("a","bc","a","a","bc","a","a"))
> table(f1)
    f1.2
f1.1 a bc
  5  2  0
  12 1  1
  13 2  1

r语言 因子 r语言中的因子型变量_取值_07

> x<-c("Yes","Yes","Yes","No","No" ,"No","Not Sure", "Yes","No","No") 
> xm<-matrix(x,5,2,byrow = T)
> xm
     [,1]       [,2] 
[1,] "Yes"      "Yes"
[2,] "Yes"      "No" 
[3,] "No"       "No" 
[4,] "Not Sure" "Yes"
[5,] "No"       "No" 
> xd<-as.data.frame(xm)
> xd
        V1  V2
1      Yes Yes
2      Yes  No
3       No  No
4 Not Sure Yes
5       No  No
> colnames(xd)<-c("Vote.x","Vote.X.Last.Time")
> xd
    Vote.x Vote.X.Last.Time
1      Yes              Yes
2      Yes               No
3       No               No
4 Not Sure              Yes
5       No               No
> table(xd)  ########生成该投票数据的表
          Vote.X.Last.Time
Vote.x     No Yes
  No        2   0
  Not Sure  0   1
  Yes       1   1

R也可以生成3维表

> v<-list(gender=c("M","M","F","M","F","F"),race=c("W","W","A","O","B","B"),pol=c("L","L","C","L","L","c"))  ###注意:最后一个"c"是小写,R区分大小写,所以pol会有"c"、"C"、"L"三个水平
> vd<-as.data.frame(v)
> vd
  gender race pol
1      M    W   L
2      M    W   L
3      F    A   C
4      M    O   L
5      F    B   L
6      F    B   c
> vdt<-table(vd)  ###生成3维表
> vdt
, , pol = c

      race
gender A B O W
     F 0 1 0 0
     M 0 0 0 0

, , pol = C

      race
gender A B O W
     F 1 0 0 0
     M 0 0 0 0

, , pol = L

      race
gender A B O W
     F 0 1 0 0
     M 0 0 1 2

表中有关矩阵和类似数组的操作

> class(vdt)  ###查看vdt的类
[1] "table"
> xdt<-table(xd)
> xdt[1,2]   ###访问表中1行2列的元素
[1] 0
> xdt[2,]
 No Yes 
  0   1

表可以像矩阵一样运算,例如用表除以(或乘以)一个标量;apply()可以按行或列汇总;addmargins()计算边际值;dimnames()获取维度名称和水平值。

> xdt/5
          Vote.X.Last.Time
Vote.x      No Yes
  No       0.4 0.0
  Not Sure 0.0 0.2
  Yes      0.2 0.2
> apply(xdt,1,sum)
      No Not Sure      Yes 
       2        1        2 
> addmargins(xdt) ###计算边际值
          Vote.X.Last.Time
Vote.x     No Yes Sum
  No        2   0   2
  Not Sure  0   1   1
  Yes       1   1   2
  Sum       3   2   5
> dimnames(xdt)  ####获取维度名称和水平值
$Vote.x
[1] "No"       "Not Sure" "Yes"     

$Vote.X.Last.Time
[1] "No"  "Yes"

aggregate()

r语言 因子 r语言中的因子型变量_r语言 因子_08


aggregate()
##操作对象是数据框,第2个参数必须是列表,最后一个是应用的函数

> aggregate(aba[,-1],list(aba$Sex),median)
  Group.1 Length Diameter Height Whole.weight Shucked.weight Viscera.weight Shell.weight
1       F  0.590    0.465  0.160      1.03850        0.44050         0.2240        0.295
2       I  0.435    0.335  0.110      0.38400        0.16975         0.0805        0.113
3       M  0.580    0.455  0.155      0.97575        0.42175         0.2100        0.276
  Rings
1    10
2     8
3    10

cut()

cut()是生成因子的常用方法,常用于表的操作
cut(x,b,labels=FALSE):x是数据向量,b是分割点向量,划分出的区间为左开右闭的(b[1],b[2]]、(b[2],b[3]]、……;设置labels=FALSE时返回的是区间的整数编号而不是因子

> num<-c(1,3,4,5,5.5,2.4,8,7.8)
> cut(num,seq(1,10,2)) ####看num中元素落入哪个区间,输出结果是落入的区间;区间左开右闭,1不属于(1,3],所以第一个结果是NA
[1] <NA>  (1,3] (3,5] (3,5] (5,7] (1,3] (7,9] (7,9]
Levels: (1,3] (3,5] (5,7] (7,9]
> cut(num,seq(0,10,2))
[1] (0,2] (2,4] (2,4] (4,6] (4,6] (2,4] (6,8] (6,8]
Levels: (0,2] (2,4] (4,6] (6,8] (8,10]