R语言学习之数据结构

一、向量

向量:用于存储数值型、字符型或逻辑型数据的一维数组。
用函数c来创建向量

x <- c(1,2,3,4,5)
y <- c("one","two","three")
 z <- c(TRUE,T,F)
c(1:100)  #输出1到100
seq(from=1,to=100,by=2) #从1开始、步长为2,不超过100(即1,3,5,…,99)
seq(from=1,to=100,length.out=10)  #从1等间距输出10个数到100

用r语言创建向量 r语言创建向量1到100_用r语言创建向量

x <- c(1,2,3,4,5)
rep(x,5)
rep(x,each=5)
rep(x,c(2,1,4,2,3))

用r语言创建向量 r语言创建向量1到100_R3_02

用r语言创建向量 r语言创建向量1到100_R3_03

mode(a)  #查看a的数据类型

向量索引

x <- c(1:100)
length(x)       #x的长度
x[1]            #向量中索引从1开始
x[0]            #索引0不对应任何元素,返回integer(0)
x[-19]          #x中除了序号为19的全部输出
x[c(4:18)]
x[c(1,23,45,67,89)]

用r语言创建向量 r语言创建向量1到100_R3_04

y<-c(1:10)
y[c(T,F,T,T,F,F,T,F,T,T)]
y[c(T)]
y[c(F,T)]
y[y>5]
y[y>5 & y<9]

用r语言创建向量 r语言创建向量1到100_R3_05

z <- c("one","two","three","four","five")
"one" %in% z
z["one" %in% z]
z %in% c("one","two")

用r语言创建向量 r语言创建向量1到100_缺失值_06

插入元素

v[20] <- 4
append(x = v,values = 99,after = 5)  #在索引为5后加入99,原向量未变
append(x = v,values = 99,after = 0)

用r语言创建向量 r语言创建向量1到100_缺失值_07

删除元素

v <- v[-c(1:3)]

用r语言创建向量 r语言创建向量1到100_R3_08

向量运算

规则是对应位置进行运算;长度不同时,短的向量会被循环使用(recycling)。长向量的长度最好是短向量长度的整数倍,否则R仍会计算,但会给出警告。

x <- 1:10
x+1
x-3
x <- x+1
y <- seq(1,100,length.out = 10)
x+y    #对应位置相加
x*y    #相乘
x**y   #乘方运算
y%%x   #取余
y%/%x  #整除运算

用r语言创建向量 r语言创建向量1到100_R3_09

#逻辑运算
x>y
x==y

用r语言创建向量 r语言创建向量1到100_数据_10

#函数运算
> x
 [1] -5 -4 -3 -2 -1  0  1  2  3  4  5
> abs(x)              #绝对值
 [1] 5 4 3 2 1 0 1 2 3 4 5
> sqrt(25)            #开方
[1] 5
> log(16,base = 2)    #对数
[1] 4
> log(16)             #以e为底
[1] 2.772589
> log10(10)
[1] 1
> exp(x)             #e的x次方
 [1] 6.737947e-03 1.831564e-02 4.978707e-02 1.353353e-01 3.678794e-01
 [6] 1.000000e+00 2.718282e+00 7.389056e+00 2.008554e+01 5.459815e+01
[11] 1.484132e+02
> ceiling(c(-2.3,3.1415))   #不低于x的最小整数
[1] -2  4
> floor(c(-2.3,3.1415))     #不超过x的最大整数
[1] -3  3
> trunc(c(-2.3,3.1415))     #x的整数部分
[1] -2  3
> round(c(-2.3,3.1415),digits = 2)   #保留几位小数
[1] -2.30  3.14
> signif(c(-2.3,3.1415),digits = 2)  #保留有效数字
[1] -2.3  3.1
> sin(x)
 [1]  0.9589243  0.7568025 -0.1411200 -0.9092974 -0.8414710  0.0000000
 [7]  0.8414710  0.9092974  0.1411200 -0.7568025 -0.9589243
> cos(x)
 [1]  0.2836622 -0.6536436 -0.9899925 -0.4161468  0.5403023  1.0000000
 [7]  0.5403023 -0.4161468 -0.9899925 -0.6536436  0.2836622
> vec <- 1:100
> vec
  [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
 [18]  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
 [35]  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51
 [52]  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
 [69]  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85
 [86]  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100
> sum(vec)   #求和
[1] 5050
> max(vec)   #最大值
[1] 100
> min(vec)   #最小值
[1] 1
> range(vec)  #范围
[1]   1 100
> mean(vec)   #平均值
[1] 50.5
> var(vec)    #方差
[1] 841.6667
> sd(vec)     #标准差
[1] 29.01149
> prod(vec)   #连乘
[1] 9.332622e+157
> median(vec)  #中位数
[1] 50.5
> quantile(vec)  #分位数
    0%    25%    50%    75%   100% 
  1.00  25.75  50.50  75.25 100.00 
> quantile(vec,c(0.4,0.6,0.8))
 40%  60%  80% 
40.6 60.4 80.2
> t <- c(1,4,2,5,7,9,6)
> t
[1] 1 4 2 5 7 9 6
> which.max(t)   #最大值的位置
[1] 6
> which.min(t)   #最小值的位置
[1] 1
> which(t==7)  
[1] 5
> which(t>7)
[1] 6

二、矩阵和数组

> x <- 1:20
> x
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
> m <- matrix(x,nrow = 4,ncol = 5)
> m <- matrix(1:20,4,5)
> m
     [,1] [,2] [,3] [,4] [,5]
[1,]    1    5    9   13   17
[2,]    2    6   10   14   18
[3,]    3    7   11   15   19
[4,]    4    8   12   16   20

> rnames <- c("R1","R2","R3","R4")
> rnames
[1] "R1" "R2" "R3" "R4"
> cnames <- c("C1","C2","C3","C4","C5")
> cnames
[1] "C1" "C2" "C3" "C4" "C5"
> m <- matrix(1:20,4,5,byrow = TRUE)   #byrow = TRUE 表示按行填充(默认按列填充),下文的m均为按行填充
> dimnames(m) <- list(rnames,cnames)
> m
   C1 C2 C3 C4 C5
R1  1  2  3  4  5
R2  6  7  8  9 10
R3 11 12 13 14 15
R4 16 17 18 19 20
> dim(x)
NULL
> dim(x) <- c(4,5)
> x
     [,1] [,2] [,3] [,4] [,5]
[1,]    1    5    9   13   17
[2,]    2    6   10   14   18
[3,]    3    7   11   15   19
[4,]    4    8   12   16   20
> ?array
> dim1 <- c("A1","A2")
> dim2 <- c("B1","B2","B3")
> dim3 <- c("C1","C2","C3","C4")
> z <- array(1:24,c(2,3,4),dimnames = list(dim1,dim2,dim3))
> z
, , C1

   B1 B2 B3
A1  1  3  5
A2  2  4  6

, , C2

   B1 B2 B3
A1  7  9 11
A2  8 10 12

, , C3

   B1 B2 B3
A1 13 15 17
A2 14 16 18

, , C4

   B1 B2 B3
A1 19 21 23
A2 20 22 24

矩阵索引

> m
   C1 C2 C3 C4 C5
R1  1  2  3  4  5
R2  6  7  8  9 10
R3 11 12 13 14 15
R4 16 17 18 19 20
> m[1,2]
[1] 2
> m[c(2:4),c(2,3)]
   C2 C3
R2  7  8
R3 12 13
R4 17 18
> m[2,]
C1 C2 C3 C4 C5 
 6  7  8  9 10 
> m[,2]
R1 R2 R3 R4 
 2  7 12 17 
> m[2]
[1] 6
> m[-1,2]
R2 R3 R4 
 7 12 17 
> m["R1","C2"]
[1] 2

矩阵运算

> m
   C1 C2 C3 C4 C5
R1  1  2  3  4  5
R2  6  7  8  9 10
R3 11 12 13 14 15
R4 16 17 18 19 20
> m+1
   C1 C2 C3 C4 C5
R1  2  3  4  5  6
R2  7  8  9 10 11
R3 12 13 14 15 16
R4 17 18 19 20 21
> m*2
   C1 C2 C3 C4 C5
R1  2  4  6  8 10
R2 12 14 16 18 20
R3 22 24 26 28 30
R4 32 34 36 38 40
> m+m
   C1 C2 C3 C4 C5
R1  2  4  6  8 10
R2 12 14 16 18 20
R3 22 24 26 28 30
R4 32 34 36 38 40
> colSums(m)       #列求和
C1 C2 C3 C4 C5 
34 38 42 46 50 
> colMeans(m)      #列的平均值
  C1   C2   C3   C4   C5 
 8.5  9.5 10.5 11.5 12.5 
> rowMeans(m)      #行的平均值
R1 R2 R3 R4    
 3  8 13 18 
> n <- matrix(1:9,3,3)
> t <- matrix(2:10,3,3)
> n
     [,1] [,2] [,3]
[1,]    1    4    7
[2,]    2    5    8
[3,]    3    6    9
> t
     [,1] [,2] [,3]
[1,]    2    5    8
[2,]    3    6    9
[3,]    4    7   10
> n*t             #对应元素相乘(Hadamard积),不是矩阵乘法
     [,1] [,2] [,3]
[1,]    2   20   56
[2,]    6   30   72
[3,]   12   42   90
> n %*% t        #矩阵乘法(线性代数中的矩阵乘积);外积应使用 %o% 或 outer()
     [,1] [,2] [,3]
[1,]   42   78  114
[2,]   51   96  141
[3,]   60  114  168
> diag(n)       #返回对角线元素
[1] 1 5 9
> t(n)          #转置
     [,1] [,2] [,3]
[1,]    1    2    3
[2,]    4    5    6
[3,]    7    8    9

三、列表

> a <- 1:20
> b <- matrix(1:20,4)
> c <- mtcars
> d <- "This is a list"
> a;b;c;d

> mlist <- list(a,b,c,d)

> mlist <- list(first=a,second=b,third=c,forth=d)

列表索引

> mlist[1]
$first
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20

> mlist[c(1,4)]
$first
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20

$forth
[1] "This is a list"

> mlist$first
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20

> mlist[[5]] <- 2:8    #添加元素需要两个中括号

> mlist[[5]] <- NULL   #赋值为NULL会把该元素从列表中删除(列表长度减1)

四、数据框

数据框是一种表格式的数据结构。数据框实质上是一个列表,列表中的每个元素是一列(长度相同的向量)。同一列内的数据必须是同一类型,不同列的类型可以不同。

> ?data.frame
> state <- data.frame(state.name,state.abb,state.region,state.x77)

> state[1]

> state[,"state.abb"]
 [1] "AL" "AK" "AZ" "AR" "CA" "CO" "CT" "DE" "FL" "GA" "HI" "ID" "IL"
[14] "IN" "IA" "KS" "KY" "LA" "ME" "MD" "MA" "MI" "MN" "MS" "MO" "MT"
[27] "NE" "NV" "NH" "NJ" "NM" "NY" "NC" "ND" "OH" "OK" "OR" "PA" "RI"
[40] "SC" "SD" "TN" "TX" "UT" "VT" "VA" "WA" "WV" "WI" "WY"

> state["Alabama",]
        state.name state.abb state.region Population Income Illiteracy
Alabama    Alabama        AL        South       3615   3624        2.1
        Life.Exp Murder HS.Grad Frost  Area
Alabama    69.05   15.1    41.3    20 50708

> state$Murder
 [1] 15.1 11.3  7.8 10.1 10.3  6.8  3.1  6.2 10.7 13.9  6.2  5.3 10.3
[14]  7.1  2.3  4.5 10.6 13.2  2.7  8.5  3.3 11.1  2.3 12.5  9.3  5.0
[27]  2.9 11.5  3.3  5.2  9.7 10.9 11.1  1.4  7.4  6.4  4.2  6.1  2.4
[40] 11.6  1.7 11.0 12.2  4.5  5.5  9.5  4.3  6.7  3.0  6.9

五、因子

由水平值构成的向量称为因子。因子用于分类。

> mtcars$cyl
 [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4

> table(mtcars$cyl)   #频数统计(cyl是数值型向量,这里当作分类变量使用)
 4  6  8 
11  7 14 

> f <- factor(c("red","red","green","blue"))
> f
[1] red   red   green blue 
Levels: blue green red

> week <- factor(c("Mon","Fri","Thu","Wed","Mon","Fri","Sun"),ordered = T,levels = c("Mon","Tue","Wed","Thu","Fri","Sat","Sun"))
> week
[1] Mon Fri Thu Wed Mon Fri Sun
Levels: Mon < Tue < Wed < Thu < Fri < Sat < Sun

六、缺失数据

在R中,NA(Not Available)代表缺失值,用来表示缺失的信息。
Inf表示无穷大:Inf为正无穷,-Inf为负无穷。

> a <- c(NA,1:49)
> sum(a)
[1] NA
> mean(a)
[1] NA
> sum(a,na.rm = T)  #忽略缺失值
[1] 1225

> is.na(a)          #逐个元素判断是否为缺失值(NA)
 [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[45] FALSE FALSE FALSE FALSE FALSE FALSE


> c <- c(NA,1:20,NA,NA)
> c
 [1] NA  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 NA NA
> d <- na.omit(c)   #删除NA
> d
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
attr(,"na.action")
[1]  1 22 23
attr(,"class")
[1] "omit"

七、字符串

用r语言创建向量 r语言创建向量1到100_数据_11

> nchar("Hello world")   #字符串长度
[1] 11
> month.name
 [1] "January"   "February"  "March"     "April"     "May"      
 [6] "June"      "July"      "August"    "September" "October"  
[11] "November"  "December" 
> nchar(month.name)
 [1] 7 8 5 5 3 4 4 6 9 7 8 8
> length(month.name)    #个数
[1] 12

> paste("Everybody","loves","states")   #拼接字符串
[1] "Everybody loves states"
> paste("Everybody","loves","states",sep="-")
[1] "Everybody-loves-states"
> names <- c("M","Y","X")
> paste(names,"love states")
[1] "M love states" "Y love states" "X love states"

> substr(x=month.name,start = 1,stop = 3)    #提取前三个字符
 [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
[12] "Dec"
> temp <- substr(x=month.name,start = 1,stop = 3)
> toupper(temp)        #转为大写
 [1] "JAN" "FEB" "MAR" "APR" "MAY" "JUN" "JUL" "AUG" "SEP" "OCT" "NOV"
[12] "DEC"
> tolower(temp)        #小写
 [1] "jan" "feb" "mar" "apr" "may" "jun" "jul" "aug" "sep" "oct" "nov"
[12] "dec"
> gsub("^(\\w)","\\U\\1",tolower(temp),perl = T)   #首字母大写
 [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
[12] "Dec"


> x <- c("b","A+","AC")
> x
[1] "b"  "A+" "AC"
> grep("A+",x)             #查找字符
[1] 2 3
> grep("A+",x,fixed = T)    #fixed为T代表按字面字符串匹配(不使用正则表达式),F(默认)代表按正则表达式匹配
[1] 2
> grep("A+",x,fixed = F)
[1] 2 3

> match("AC",x)   #字符串匹配
[1] 3


> path <- "/usr/local/bin/R"      #字符串分割
> strsplit(path,"/")
[[1]]
[1] ""      "usr"   "local" "bin"   "R"    

> strsplit(c(path,path),"/")
[[1]]
[1] ""      "usr"   "local" "bin"   "R"    

[[2]]
[1] ""      "usr"   "local" "bin"   "R"

八、日期和时间

  • 对事件序列的描述
  • 利用前面的结果进行预测

Sys.Date() 查看系统当前日期

>  Sys.Date()
[1] "2020-09-15"

as.Date(a,format = "%Y-%m-%d") 转为日期型

> a <- "2017-01-01"
> b <- as.Date(a,format = "%Y-%m-%d")
> class(a)
[1] "character"
> class(b)
[1] "Date"

seq(as.Date("2017-01-01"),as.Date("2017-07-05"),by=5) 生成时间序列,间隔5天

> seq(as.Date("2017-01-01"),as.Date("2017-07-05"),by=5)
 [1] "2017-01-01" "2017-01-06" "2017-01-11" "2017-01-16" "2017-01-21"
 [6] "2017-01-26" "2017-01-31" "2017-02-05" "2017-02-10" "2017-02-15"
[11] "2017-02-20" "2017-02-25" "2017-03-02" "2017-03-07" "2017-03-12"
[16] "2017-03-17" "2017-03-22" "2017-03-27" "2017-04-01" "2017-04-06"
[21] "2017-04-11" "2017-04-16" "2017-04-21" "2017-04-26" "2017-05-01"
[26] "2017-05-06" "2017-05-11" "2017-05-16" "2017-05-21" "2017-05-26"
[31] "2017-05-31" "2017-06-05" "2017-06-10" "2017-06-15" "2017-06-20"
[36] "2017-06-25" "2017-06-30" "2017-07-05"

ts(sales,start = c(2010,5),end = c(2014,4),frequency = 1)

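frequency 值为1时代表以年为单位,值为4时代表以季度为单位,值为12时代表以月份为单位。start/end 的第二个数表示该年内的第几个周期;若它超过 frequency 会向后进位,因此下面 frequency = 1 时起点变为2014、终点变为2017,frequency = 4 时起点变为2011年第1季度。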

> sales <- round(runif(48,min=50,max=100))
> sales
 [1]  90  51  88  89  83  73  82  63  73  90  74  71 100  78  96  59  57
[18]  71  99  72  58  75  91  59  63  57  85  50  64  65  90  83  54  71
[35]  69  66  64  84  84  52  73  51  67  80  84  69  54  93
> ts(sales,start = c(2010,5),end = c(2014,4),frequency = 1)
Time Series:
Start = 2014 
End = 2017 
Frequency = 1 
[1] 90 51 88 89
> ts(sales,start = c(2010,5),end = c(2014,4),frequency = 4)
     Qtr1 Qtr2 Qtr3 Qtr4
2011   90   51   88   89
2012   83   73   82   63
2013   73   90   74   71
2014  100   78   96   59
> ts(sales,start = c(2010,5),end = c(2014,4),frequency = 12)
     Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
2010                  90  51  88  89  83  73  82  63
2011  73  90  74  71 100  78  96  59  57  71  99  72
2012  58  75  91  59  63  57  85  50  64  65  90  83
2013  54  71  69  66  64  84  84  52  73  51  67  80
2014  84  69  54  93

)
Time Series:
Start = 2014
End = 2017
Frequency = 1
[1] 90 51 88 89

ts(sales,start = c(2010,5),end = c(2014,4),frequency = 4)
Qtr1 Qtr2 Qtr3 Qtr4
2011 90 51 88 89
2012 83 73 82 63
2013 73 90 74 71
2014 100 78 96 59
ts(sales,start = c(2010,5),end = c(2014,4),frequency = 12)
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
2010 90 51 88 89 83 73 82 63
2011 73 90 74 71 100 78 96 59 57 71 99 72
2012 58 75 91 59 63 57 85 50 64 65 90 83
2013 54 71 69 66 64 84 84 52 73 51 67 80
2014 84 69 54 93