1. word,str_sub:字符串提取
library(stringr)
#packageVersion('stringr')
#ls("package:stringr")
### 1. word, str_sub: extracting parts of a string
ids <- c("A.4.35", "E.1.32", "N.2.5")
# word() treats each string as fields separated by `sep`; fixed(".") makes the
# dot a literal separator. ids_short is reassigned on every line, so only the
# result of the last call is kept.
ids_short <- word(ids, start = 1, sep = fixed("."))  # first field: "A" "E" "N"
ids_short <- word(ids, start = 2, sep = fixed("."))  # second field: "4" "1" "2"
ids_short <- word(ids, start = -1, sep = fixed("."))  # last field: "35" "32" "5"
ids_short <- word(ids, start = 2, end = 3, sep = fixed("."))  # fields 2 to 3
x <- "BBCDEF"
str_sub(x, start = 1, end = 3)  # extract characters 1-3
str_sub(x, start = 2, end = -2)  # second through second-to-last character
str_sub(x, 1, 2) <- "AA"  # replacement form: overwrites characters 1-2 of x
hw <- "Hadley Wickham"
str_sub(hw, start = c(1, 8), end = c(3, 11))  # [1] "Had" "Wick"
2. str_split:字符串分隔
### 2. str_split: splitting strings
# Sample input for the calls below. `a` was previously used without ever being
# assigned, so every str_split(a, ...) call stopped with "object 'a' not found".
a <- "ab.c5d.e12"
str_split(a, "\\.")  # split on a literal dot (escaped as \\.)
str_split(a, "[.]")  # same split; inside a character class the dot is literal
str_split(a, "[5.a]")  # several delimiters at once: "5", "." or "a"
# Character-class cheat sheet:
#   [a-zA-Z0-9]        any ASCII letter or digit
#   [[a-z][A-Z][0-9]]  nested-set spelling; presumably the same class -- verify
#   [abc]              one of a, b, c
#   [^abc]             any character except a, b, c
#   [\\p{Letter}]      any letter
#   [\\P{Letter}]      any character that is not a letter
str_split(a, "[\\p{Letter}]")  # every letter acts as a delimiter
str_split(a, "[\\P{Letter}]")  # every non-letter acts as a delimiter
str_split(a, "[^abc]")  # every character other than a, b, c is a delimiter
fruits <- c(
  "apples and oranges and pears and bananas",
  "pineapples and mangos and guavas"
)
# n fixes the number of pieces (columns) produced for every input string.
str_split_fixed(fruits, " and ", n = 3)
str_split_fixed(fruits, " and ", n = 4)
# str_split() returns a list (one element per input string);
# str_split_fixed() returns a character matrix.
3. str_trim:去掉字符串的空格和TAB(\t)
### 3. str_trim: strip spaces and tabs (\t) from the ends of a string
# Usage: str_trim(string, side = c("both", "left", "right"))
mystr <- " AABBBBCCC "
str_trim(string = mystr, side = "left")  # leading whitespace only
str_trim(string = mystr, side = "right")  # trailing whitespace only
str_trim(string = mystr, side = "both")  # both ends
4. str_pad:补充字符串的长度
### 4. str_pad: pad strings out to a target width
# `side` accepts "left", "right" or "both".
str_pad("separation", width = 20, side = "both", pad = "*")
# side is not given here, so stringr's default side is used.
str_pad(c("a", "abc", "abcdef"), width = 10, pad = "*")
5. str_dup: 复制字符串
### 5. str_dup: repeat (duplicate) strings
fruit <- c("apple", "pear", "banana")
str_dup(fruit, times = 2)  # every element repeated twice
str_dup(fruit, times = 1:3)  # element i repeated i times (1, 2, 3)
6. str_subset: 返回匹配的字符串
### 6. str_subset: keep the strings that match a pattern
fruit <- c("apple", "banana", "pear", "pinapple")
# str_subset() returns the matching strings themselves ...
str_subset(fruit, pattern = "pi")
str_subset(fruit, pattern = "[eu]")
# ... while str_which() returns the positions (indices) of the matches.
str_which(fruit, pattern = "pi")
str_which(fruit, pattern = "[eu]")
7. str_count:字符串计数
### 7. str_count: counting matches in a string
mstr <- c("ttccaggact", "ggccgctaatccc")
str_count(mstr)  # no pattern given: total length of each string
str_count(mstr, pattern = "t")  # how many times "t" occurs in each string
str_length(mstr)  # total length of each string
8. str_sort:字符串值排序
### 8. str_sort: sorting character vectors
mstr <- c("B123", "A234", "D789")
str_sort(mstr)  # sorted values, ascending
str_sort(mstr, decreasing = TRUE)  # sorted values, descending
str_order(mstr)  # the ordering permutation (indices) instead of the values
str_order(letters)
str_sort(letters)
str_sort(LETTERS)
x <- c("100a1", "10a5", NA, "2b", "2a")
str_sort(x)
str_sort(x, numeric = TRUE)  # digit runs are compared as numbers
# na_last controls where the NA entry is placed in the result.
str_sort(x, decreasing = FALSE, na_last = TRUE, numeric = TRUE)
str_sort(x, decreasing = FALSE, na_last = FALSE, numeric = TRUE)
9. str_match:字符串匹配
### 9. str_match: extracting matches (and match groups)
## str_match() returns a matrix; str_match_all() returns a list.
# The middle element is written as "123": c() would coerce a bare 123 to that
# same string when combined with character values.
val <- c("abc", "123", "cbaa")
str_match(val, pattern = "a")  # first "a" in each string
str_match(val, pattern = "[0-9]")  # first single digit in each string
# `*` also accepts an empty match, so this takes the (possibly empty) run of
# digits at the start of each string.
str_match(val, pattern = "[0-9]*")
str_match_all(val, pattern = "a")  # every "a": shows how many each string holds
str_match_all(val, pattern = "[0-9]")
10. str_replace:字符串替换
### 10. str_replace: replacing matches
# "123" is spelled as a string; c() would coerce a bare 123 to the same value.
val <- c("abc", "123", "cba", NA)
str_replace(val, pattern = "[ab]", replacement = "-")  # first match only
str_replace_all(val, pattern = "[ab]", replacement = "-")  # every match
# NOTE: "\1" with a single backslash is R's octal escape for the control
# character U+0001, not a regex backreference (that would be "\\1"). Every "a"
# is therefore replaced by that escaped character.
str_replace_all(val, pattern = "[a]", replacement = "\1")
str_replace_na(c(NA, "abc", "def"))  # turn NA into the string "NA"
str_replace_na(val)
11. str_locate 字符串模式位置查找
### 11. str_locate: finding the position of a pattern
# str_locate():     first match only, returned as a matrix
# str_locate_all(): every match, returned as a list
# "123" is spelled as a string; c() would coerce a bare 123 to the same value.
val <- c("aabcdaa", "123", "cbaaccddaaa")
str_locate(val, pattern = "aa")
str_locate_all(val, pattern = "aa")
12. str_extract 提取指定模式的字符串
### 12. str_extract: extracting the text that matches a pattern
shopping_list <- c("apples x44", "bag of flour", "bag of sugar", "milk x2")
# Digits: first digit only, then every digit (kept in a list).
str_extract(shopping_list, pattern = "\\d")
str_extract_all(shopping_list, pattern = "\\d")
# Lowercase letter runs: first run as a vector, then every run as a list.
str_extract(shopping_list, pattern = "[a-z]+")
str_extract_all(shopping_list, pattern = "[a-z]+")
# At most four letters from the first run, then the same length limit applied
# between word boundaries (\\b).
str_extract(shopping_list, pattern = "[a-z]{1,4}")
str_extract(shopping_list, pattern = "\\b[a-z]{1,4}\\b")
13. str_c:字符串合并
### 13. str_c: joining strings
fruit <- c("apple", "banana", "pear", "orange")
# Each call is followed by the corresponding base-R paste() call, commented out.
str_c(1:4, fruit, sep = "-")
# paste(1:4, fruit, sep = "-")
str_c("Letter", letters, sep = ": ")
# paste("Letter", letters, sep = ": ")
str_c(letters, collapse = ", ")  # join the whole vector into one string
# paste(letters, collapse = ", ")
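stringr 包中的重要函数

| 函数 | 功能说明 | R Base 中对应函数 |
| --- | --- | --- |
| **使用正则表达式的函数** | | |
| str_extract() | 提取首个匹配模式的字符 | regmatches() |
| str_extract_all() | 提取所有匹配模式的字符 | regmatches() |
| str_locate() | 返回首个匹配模式的字符的位置 | regexpr() |
| str_locate_all() | 返回所有匹配模式的字符的位置 | gregexpr() |
| str_replace() | 替换首个匹配模式 | sub() |
| str_replace_all() | 替换所有匹配模式 | gsub() |
| str_split() | 按照模式分割字符串 | strsplit() |
| str_split_fixed() | 按照模式将字符串分割成指定个数 | - |
| str_detect() | 检测字符是否存在某些指定模式 | grepl() |
| str_count() | 返回指定模式出现的次数 | - |
| **其他重要函数** | | |
| str_sub() | 提取指定位置的字符 | substr() |
| str_dup() | 复制(重复)字符串 | - |
| str_length() | 返回字符的长度 | nchar() |
| str_pad() | 填补字符 | - |
| str_trim() | 丢弃填充,如去掉字符前后的空格 | - |
| str_c() | 连接字符 | paste(), paste0() |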