1. word,str_sub:字符串提取

library(stringr)
#packageVersion('stringr')
#ls("package:stringr")

### 1. word, str_sub: extracting parts of a string
# word() splits each element on `sep` and returns the fields selected by
# position; a negative position counts from the end. Every call below
# overwrites ids_short, so only the last result survives.
ids <- c("A.4.35", "E.1.32", "N.2.5")
ids_short <- word(ids, start = 1, sep = fixed("."))   # first field
ids_short <- word(ids, start = 2, sep = fixed("."))   # second field
ids_short <- word(ids, start = -1, sep = fixed("."))  # last field
# Fields 2 through 3, still joined by the separator.
ids_short <- word(ids, start = 2, end = 3, sep = fixed("."))

# str_sub() addresses characters by position; negative positions count
# from the end of the string.
x <- "BBCDEF"
str_sub(x, start = 1, end = 3)   # extract characters 1-3
str_sub(x, start = 2, end = -2)  # everything except the first and last character
str_sub(x, start = 1, end = 2) <- "AA"  # replace characters 1-2 in place

# start and end are vectorised: one substring per (start, end) pair.
hw <- "Hadley Wickham"
str_sub(hw, start = c(1, 8), end = c(3, 11))  # [1] "Had"  "Wick"

2. str_split:字符串分隔

### 2. str_split: splitting strings

# Sample input for this section. `a` was never assigned, so every call
# below stopped with "object 'a' not found".
a <- "abc.5def.ghi"

str_split(a, "\\.")    # split on a literal dot (escaped with \\)
str_split(a, "[.]")    # same split: inside [] the dot is literal
str_split(a, "[5.a]")  # several delimiters: "5", "." or "a"

# Character-class reference:
# [[a-z][A-Z][0-9]]  nested-set form of lowercase / uppercase / digit
#                    NOTE(review): confirm it matches the same set as the
#                    next line in the regex engine in use
# [a-zA-Z0-9]        lowercase, uppercase or digit, written as ranges
# [abc]              any one of a, b, c
# [^abc]             any character except a, b, c
# [\\P{Letter}]      any character that is not a letter
# [\\p{Letter}]      any letter

str_split(a, "[\\p{Letter}]")  # split on every letter
str_split(a, "[\\P{Letter}]")  # split on every non-letter character
str_split(a, "[^abc]")         # split on every character other than a, b, c

fruits <- c(
  "apples and oranges and pears and bananas",
  "pineapples and mangos and guavas"
)

# Split into a fixed number of pieces (3, then 4) per input string.
str_split_fixed(fruits, " and ", 3)
str_split_fixed(fruits, " and ", 4)

# str_split() returns a list.
# str_split_fixed() returns a matrix.

3. str_trim:去掉字符串首尾的空格和TAB(\t)

### 3. str_trim: strip leading/trailing whitespace (spaces, tabs)
# Signature: str_trim(string, side = c("both", "left", "right"))
mystr <- " AABBBBCCC "
str_trim(string = mystr, side = "left")   # leading whitespace only
str_trim(string = mystr, side = "right")  # trailing whitespace only
str_trim(string = mystr, side = "both")   # both ends

4. str_pad:补充字符串的长度

### 4. str_pad: pad a string to a target width
# `side` is one of "left", "right" or "both"; `pad` is the fill character.
str_pad("separation", width = 20, side = "both", pad = "*")
# No `side` given here, so the function's default side is used.
str_pad(c("a", "abc", "abcdef"), width = 10, pad = "*")

5. str_dup: 复制字符串

### 5. str_dup: repeat strings
fruit <- c("apple", "pear", "banana")
str_dup(fruit, times = 2)    # every element repeated twice
str_dup(fruit, times = 1:3)  # element i repeated i times (1, 2, 3)

6. str_subset: 返回匹配的字符串

### 6. str_subset / str_which: keep the elements that match a pattern
fruit <- c("apple", "banana", "pear", "pinapple")
# str_subset() returns the matching strings themselves.
str_subset(fruit, pattern = "pi")
str_subset(fruit, pattern = "[eu]")
# str_which() returns the positions of the matching strings instead.
str_which(fruit, pattern = "pi")
str_which(fruit, pattern = "[eu]")

7. str_count:字符串计数

### 7. str_count: count matches in each string
mstr <- c("ttccaggact", "ggccgctaatccc")
str_count(mstr)                 # no pattern: total length of each string
str_count(mstr, pattern = "t")  # occurrences of "t" in each string

str_length(mstr)  # total length of each string

8. str_sort:字符串值排序

### 8. str_sort / str_order: sorting character vectors
# str_sort() returns the sorted values; str_order() returns the
# permutation of indices that would sort them.
mstr <- c("B123", "A234", "D789")
str_sort(mstr)
str_sort(mstr, decreasing = TRUE)
str_order(mstr)
str_order(letters)
str_sort(letters)
str_sort(LETTERS)

# Strings with embedded numbers, plus a missing value.
x <- c("100a1", "10a5", NA, "2b", "2a")
str_sort(x)
str_sort(x, numeric = TRUE)  # compare digit runs as numbers
# na_last chooses whether NA goes at the end (TRUE) or the start (FALSE).
str_sort(x, numeric = TRUE, decreasing = FALSE, na_last = TRUE)
str_sort(x, numeric = TRUE, decreasing = FALSE, na_last = FALSE)

9. str_match:字符串匹配

### 9. str_match: extract matches (and match groups)
## str_match() returns a matrix; str_match_all() returns a list.

# c() coerces the bare 123 to the string "123".
val <- c("abc", 123, "cbaa")
str_match(val, pattern = "a")       # first "a" in each string
str_match(val, pattern = "[0-9]")   # first single digit in each string
str_match(val, pattern = "[0-9]*")  # a run of digits of any length

# The *_all variant returns every match, so it also shows how many times
# the pattern occurs in each string.
str_match_all(val, pattern = "a")
str_match_all(val, pattern = "[0-9]")

10. str_replace:字符串替换

### 10. str_replace: replace matches
val <- c("abc", 123, "cba", NA)
str_replace(val, pattern = "[ab]", replacement = "-")      # first match only
str_replace_all(val, pattern = "[ab]", replacement = "-")  # every match
# Replace every "a" with the escaped character "\1". R reads "\1" as an
# octal escape (character code 1), not as a back-reference.
str_replace_all(val, pattern = "[a]", replacement = "\1")

# str_replace_na() turns a missing value into the literal string "NA".
str_replace_na(c(NA, "abc", "def"))
str_replace_na(val)

11. str_locate 字符串模式位置查找

### 11. str_locate: find the position of a pattern
# str_locate():     first match only, returned as a matrix
# str_locate_all(): every match, returned as a list

val <- c("aabcdaa", 123, "cbaaccddaaa")
str_locate(val, pattern = "aa")
str_locate_all(val, pattern = "aa")

12. str_extract 提取指定模式的字符串

### 12. str_extract: pull out the text that matches a pattern
shopping_list <- c("apples x44", "bag of flour", "bag of sugar", "milk x2")

# Digits: first one only, then all of them (returned as a list).
str_extract(shopping_list, pattern = "\\d")
str_extract_all(shopping_list, pattern = "\\d")

# Lowercase letter runs: first one as a vector, then all of them as a list.
str_extract(shopping_list, pattern = "[a-z]+")
str_extract_all(shopping_list, pattern = "[a-z]+")

# {1,4} limits the run to between one and four letters; adding \\b on
# both sides requires the run to sit between word boundaries.
str_extract(shopping_list, pattern = "[a-z]{1,4}")
str_extract(shopping_list, pattern = "\\b[a-z]{1,4}\\b")

13. str_c:字符串合并

### 13. str_c: join strings
# The commented paste() call under each example is the base-R counterpart.
fruit <- c("apple", "banana", "pear", "orange")
str_c(1:4, fruit, sep = "-")
# paste(1:4, fruit, sep = "-")

str_c("Letter", letters, sep = ": ")
# paste("Letter", letters, sep = ": ")

# `collapse` joins the whole input vector into one comma-separated string.
str_c(letters, collapse = ", ")
# paste(letters, collapse = ", ")

stringr包中的重要函数

函数

功能说明

R Base中对应函数

使用正则表达式的函数

str_extract()

提取首个匹配模式的字符

regmatches()

str_extract_all()

提取所有匹配模式的字符

regmatches()

str_locate()

返回首个匹配模式的字符的位置

regexpr()

str_locate_all()

返回所有匹配模式的字符的位置

gregexpr()

str_replace()

替换首个匹配模式

sub()

str_replace_all()

替换所有匹配模式

gsub()

str_split()

按照模式分割字符串

strsplit()

str_split_fixed()

按照模式将字符串分割成指定个数

-

str_detect()

检测字符是否存在某些指定模式

grepl()

str_count()

返回指定模式出现的次数

-

其他重要函数

str_sub()

提取指定位置的字符

substr(), substring()

str_dup()

复制字符串

-

str_length()

返回字符的长度

nchar()

str_pad()

填补字符

-

str_trim()

丢弃填充,如去掉字符前后的空格

-

str_c()

连接字符

paste(),paste0()