Eloquent JavaScript #09# Regular Expressions

转载

mb5ff40d0fc970b 2018-09-02 12:32:00

索引

Notes
Exercise

Notes

1、正则表达式帮助我们在字符串中寻找特定模式。

js创建正则表达式的两种等价写法：

let re1 = new RegExp("abc");
let re2 = /abc/;

2、应用正则表达式

console.log(/abc/.test("abcde"));
// → true
console.log(/abc/.test("abxde"));
// → false

3、字符集合

`\d`	Any digit character
`\w`	An alphanumeric character (“word character”)
`\s`	Any whitespace character (space, tab, newline, and similar)
`\D`	A character that is not a digit
`\W`	A nonalphanumeric character
`\S`	A nonwhitespace character
`.`	Any character except for newline

`/abc/`	A sequence of characters
`/[abc]/`	Any character from a set of characters
`/[^abc]/`	Any character not in a set of characters
`/[0-9]/`	Any character in a range of characters
`/x+/`	One or more occurrences of the pattern `x`
`/x+?/`	One or more occurrences, nongreedy
`/x*/`	Zero or more occurrences
`/x?/`	Zero or one occurrence
`/x{2,4}/`	Two to four occurrences
`/(abc)/`	A group
`/a|b|c/`	Any one of several patterns
`/\d/`	Any digit character
`/\w/`	An alphanumeric character (“word character”)
`/\s/`	Any whitespace character
`/./`	Any character except newlines
`/\b/`	A word boundary
`/^/`	Start of input
`/$/`	End of input

\d等转义字符可以放在 [ ] 里而不丧失含义，但是 . 和 + 之类的特殊符号不行，会变为普通的符号。

整体取反，非0非1：

let notBinary = /[^01]/;
console.log(notBinary.test("1100100010100110"));
// → false
console.log(notBinary.test("1100100010200110"));
// → true

4、重复匹配

+ one or more，* zero or more

console.log(/'\d+'/.test("'123'"));
// → true
console.log(/'\d+'/.test("''"));
// → false
console.log(/'\d*'/.test("'123'"));
// → true
console.log(/'\d*'/.test("''"));
// → true

? zero or one

let neighbor = /neighbou?r/;
console.log(neighbor.test("neighbour"));
// → true
console.log(neighbor.test("neighbor"));
// → true

{2} a pattern should occur a precise number of times，It is also possible to specify a range this way: {2,4} means the element must occur at least twice and at most four times.

let dateTime = /\d{1,2}-\d{1,2}-\d{4} \d{1,2}:\d{2}/;
console.log(dateTime.test("1-30-2003 8:45"));
// → true

You can also specify open-ended ranges when using braces by omitting the number after the comma. So, {5,} means five or more times.

5、分组（子表达式）

括号内的n个元素被视作一个整体元素（分组，子表达式）：

let cartoonCrying = /boo+(hoo+)+/i;
console.log(cartoonCrying.test("Boohoooohoohooo"));
// → true

i表示该表达式大小写不敏感。

6、进行正则匹配的另外一种方式

可以让我们获取额外的信息：

let match = /\d+/.exec("one two 100");
console.log(match);
// → ["100"]
console.log(match.index);
// → 8

exec的返回值：匹配失败为null，成功则如上所示。

等价写法：

console.log("one two 100".match(/\d+/));
// → ["100"]

含括号表达式的情况：

let quotedText = /'([^']*)'/;
console.log(quotedText.exec("she said 'hello'"));
// → ["'hello'", "hello"]

console.log(/bad(ly)?/.exec("bad"));
// → ["bad", undefined]
console.log(/(\d)+/.exec("123"));
// → ["123", "3"]

返回数组的第一个元素为整个正则表达式匹配的字符串，而第二元素为() 内正则（子表达式）匹配的字符串（没有就是undefined，多个就取最后一个）。容易知道，第二个元素几乎总是第一个元素的子集。

7、The Date class

console.log(new Date());
// → Sat Sep 01 2018 13:54:43 GMT+0800 (中国标准时间)

console.log(new Date(2009, 11, 9));
// → Wed Dec 09 2009 00:00:00 GMT+0800 (中国标准时间)
console.log(new Date(2009, 11, 9, 12, 59, 59, 999));
// → Wed Dec 09 2009 12:59:59 GMT+0800 (中国标准时间)

console.log(new Date(1997, 10, 19).getTime());
// → 879868800000
console.log(new Date(1387407600000));
// → Thu Dec 19 2013 07:00:00 GMT+0800 (中国标准时间)

console.log(new Date().getTime());
// → 1535781283593
console.log(Date.now());
// → 1535781283593

通过正则表达式，由String创建日期：

"use strict";

/**
 * Build a Date from the first "M-D-YYYY" pattern found in a string.
 *
 * @param {string} string - Text containing a date such as "1-30-2003".
 * @returns {Date} Local-time Date for midnight of the matched day.
 * @throws {TypeError} If the text contains no M-D-YYYY date.
 */
function getDate(string) {
  const match = /(\d{1,2})-(\d{1,2})-(\d{4})/.exec(string);
  if (match === null) {
    // exec returns null when nothing matches; report that clearly instead of
    // letting the destructuring below fail with "null is not iterable".
    throw new TypeError(`No M-D-YYYY date found in '${string}'`);
  }
  // Index 0 is the whole match; only the three captured groups are needed.
  const [, month, day, year] = match;
  // Date months are zero-based, hence the -1.
  return new Date(year, month - 1, day);
}
console.log(getDate("1-30-2003"));
// → Thu Jan 30 2003 00:00:00 GMT+0100 (CET)

PS. 下划线除了用来占位外没有其它含义。

8、强制匹配整个字符串

利用 ^ 和 $ 。例如/^\d+$/匹配完全由数字构成的字符串，/^!/ 匹配由！开头的字符串，而/x^/ 啥也匹配不了。

用 \b 标注单词边界：

console.log(/cat/.test("concatenate"));
// → true
console.log(/\bcat\b/.test("concatenate"));
// → false
console.log(/\bcat\b/.test("xx cat xx"));
// → true

9、Choice patterns

let animalCount = /\b\d+ (pig|cow|chicken)s?\b/;
console.log(animalCount.test("15 pigs"));
// → true
console.log(animalCount.test("15 pigchickens"));
// → false

10、正则匹配的机制

当你进行正则匹配时（test或者exec），正则引擎将从所给字符串的开头开始尝试匹配，接着是第二个字符，第三个字符... 试图在所给字符串中寻找一个匹配，直到找到一个匹配项或者到达字符串末尾结束。要么返回第一个匹配，要么什么都匹配不到。

/**
 * 模拟用正则\b\d+ (pig|cow|chicken)s?\b
 * 匹配"the 3 pigs"
 */

const str = "the 3 pigs";

/**
 * Hand-written simulation of how a regex engine matches
 * /\b\d+ (pig|cow|chicken)s?\b/ against a string.
 *
 * Each candidate start position is tried in turn; the first position at
 * which the whole pattern succeeds wins.
 *
 * @param {string} str - Text to search.
 * @param {number} start - Index of the first start position to try.
 * @returns {string[] | null} One-element array holding the matched text,
 *   or null when no position from `start` onward matches.
 */
function simulateRegex(str, start) {
    const animals = ["pig", "cow", "chicken"];
    // Out-of-range indexes yield undefined, which is neither a digit nor a
    // word character, so the string edges behave like non-word characters.
    const isDigit = ch => ch !== undefined && ch >= "0" && ch <= "9";
    const isLetter = ch => ch !== undefined &&
        ((ch >= "a" && ch <= "z") || (ch >= "A" && ch <= "Z"));
    const isWordChar = ch => isDigit(ch) || isLetter(ch) || ch === "_";

    // Try every start position until one matches or the string runs out.
    // `<` (not `!=`) so a start beyond the end terminates immediately.
    for (let currentPosition = start; currentPosition < str.length; ++currentPosition) {
        let tempPosition = currentPosition;
        // \b : the preceding character must not be a word character
        // (punctuation or the start of the string both qualify).
        if (isWordChar(str[currentPosition - 1])) continue;
        // \d+ : at least one digit, then as many more as are available.
        if (!isDigit(str[tempPosition])) continue;
        while (isDigit(str[tempPosition])) {
            tempPosition++;
        }
        // A single literal space.
        if (str[tempPosition] !== " ") continue;
        tempPosition++;
        // (pig|cow|chicken) : alternatives are tried left to right.
        const animal = animals.find(word => str.startsWith(word, tempPosition));
        if (animal === undefined) continue;
        tempPosition += animal.length;
        // s? : optional plural. No backtracking is needed here: if the "s" is
        // consumed and the boundary below fails, giving the "s" back would
        // leave a word character ("s") right after the word, failing as well.
        if (str[tempPosition] === "s") tempPosition++;
        // \b : the following character must not be a word character.
        if (isWordChar(str[tempPosition])) continue;
        // slice's end index is exclusive, so tempPosition is already correct.
        return [str.slice(currentPosition, tempPosition)];
    }
    return null;
}

let match = simulateRegex(str, 4);
console.log(match);
// → ["3 pigs"]

11、回溯Backtracking

正则引擎在进行分支匹配（|）或重复匹配（+ *）时，如果发现无法再继续往下匹配，就会进行“回溯”。

在进行分支匹配时，如果第一个分支就匹配成功，就不再匹配其它分支，如果不成功就会回溯到分支的入口，进入到另外一个分支继续匹配。

而进行重复匹配时，例如用/^.*x/匹配"abcxe"，.*会首先把所有字符消费干净，当正则引擎发现最后还需要一个x时，*操作符会尝试少匹配一个字符，但是仍然没发现x，于是继续回溯，直到.*只匹配abc、其后紧跟着x为止，最终整个正则匹配到的字符串是abcx。

12、The replace method

replace配合正则：

console.log("papa".replace("p", "m"));
// → mapa

console.log("Borobudur".replace(/[ou]/, "a"));
// → Barobudur
console.log("Borobudur".replace(/[ou]/g, "a")); // g代表global全部
// → Barabadar

replace的真正强大之处在于可以用“$数字”引用匹配字符串：

console.log(
  "Liskov, Barbara\nMcCarthy, John\nWadler, Philip"
    .replace(/(\w+), (\w+)/g, "$2 $1"));
// → Barbara Liskov
//   John McCarthy
//   Philip Wadler


"hello, word, every, one".replace(/(\w+),/g, "$1 "); // “$+数字”引用匹配中的分组
// → "hello  word  every  one"
"hello, word, every, one".replace(/one/g, "$& $&"); // “$&”引用整个匹配
// → "hello, word, every, one one"

还可以传入函数：

"hello, word, every, one".replace(/(\w+),/g, str => str.toUpperCase()); 
// → "HELLO, WORD, EVERY, one"

let stock = "1 lemon, 2 cabbages, and 101 eggs";
/**
 * Replacement callback for a /(\d+) (\w+)/ match that lowers the count by one.
 *
 * @param {string} match - The full matched text (unused).
 * @param {string} amount - Captured count, as digits.
 * @param {string} unit - Captured unit word.
 * @returns {string} The decremented count followed by the unit: "no <unit>"
 *   when nothing is left, and the unit minus its last character when exactly
 *   one is left.
 */
function minusOne(match, amount, unit) {
  const remaining = Number(amount) - 1;
  switch (remaining) {
    case 1:
      // Only one left: drop the unit's final character (the plural "s").
      return `1 ${unit.slice(0, -1)}`;
    case 0:
      return `no ${unit}`;
    default:
      return `${remaining} ${unit}`;
  }
}
console.log(stock.replace(/(\d+) (\w+)/g, minusOne));
// → no lemon, 1 cabbage, and 100 eggs

13、贪婪Greed

/**
 * Remove line comments and block comments from a piece of code.
 *
 * The block-comment branch uses a greedy `[^]*`, so it runs from the first
 * opening marker to the LAST closing marker in the text, swallowing any code
 * between two separate block comments.
 *
 * @param {string} code - Source text.
 * @returns {string} The text with every matched comment replaced by "".
 */
function stripComments(code) {
  // [^] matches any character including newlines, unlike the period.
  const commentPattern = /\/\/.*|\/\*[^]*\*\//g;
  return code.replace(commentPattern, "");
}
console.log(stripComments("1 + /* 2 */3"));
// → 1 + 3
console.log(stripComments("x = 10;// ten!"));
// → x = 10;
console.log(stripComments("1 /* a */+/* b */ 1"));
// → 1  1

可以用replace来去掉代码中的所有注释。

[^]可以匹配任何字符，因为 /**/可能跨多行，句点 . 无法匹配换行符号。

然而上面最后一行代码结果却出错了，这是为什么呢？

因为(+, *, ?, and {}) 这些操作符号都是贪婪的，就像“回溯”里面提到的，它们总是先尽可能地消费字符，直到无路可走才会回头，这样理所当然会匹配到更长的那一个。解决方案就是在这些符号后面加问号 (+?, *?, ??, {}?)，这样它们就会匹配尽可能少的字符串。

/**
 * Remove line comments and block comments from a piece of code.
 *
 * The block-comment branch uses the non-greedy `[^]*?`, so each block
 * comment ends at the nearest closing marker and the code between two
 * separate block comments is kept.
 *
 * @param {string} code - Source text.
 * @returns {string} The text with every matched comment replaced by "".
 */
function stripComments(code) {
  // [^] matches any character including newlines, unlike the period.
  const commentPattern = /\/\/.*|\/\*[^]*?\*\//g;
  return code.replace(commentPattern, "");
}
console.log(stripComments("1 /* a */+/* b */ 1"));
// → 1 + 1

当要用到重复匹配符时，先考虑用非贪婪版本的。

14、动态构建正则表达式

利用new RegExp(拼接字符串， "gi")构建，gi表示global替换全部和大小写不敏感。

let name = "harry";
let text = "Harry is a suspicious character.";
let regexp = new RegExp("\\b(" + name + ")\\b", "gi");
console.log(text.replace(regexp, "_$1_"));
// → _Harry_ is a suspicious character.

let name = "dea+hl[]rd";
let text = "This dea+hl[]rd guy is super annoying.";
let escaped = name.replace(/[\\[.+*?(){|^$]/g, "\\$&");
// escaped → "dea\+hl\[]rd"
let regexp = new RegExp("\\b" + escaped + "\\b", "gi");
console.log(text.replace(regexp, "_$&_"));
// → This _dea+hl[]rd_ guy is super annoying.

15、Search

正则版indexOf：

console.log("  word".search(/\S/));
// → 2
console.log("    ".search(/\S/));
// → -1

16、The lastIndex property

需求：设置从字符串的某个字符开始匹配

问题：没有方便的办法

理由：不方便正是js的特性。。。。

解决方案：在【严格的条件】下用lastIndex设定起始位置

严格的条件：表达式必须开启g（global）或者y（sticky）选项，并且必须通过exec方式执行匹配。

lastIndex：正则对象的一个属性，数字，决定了下一个匹配从第几个字符开始。在严格条件 ↑ 下设定才有效。非严格条件下改变该值是毫无作用的。

let pattern = /y/g;
pattern.lastIndex = 3;
let match = pattern.exec("xyzzy");
console.log(match.index);
// → 4
console.log(pattern.lastIndex);
// → 5

仅global：匹配成功，自动更新lastIndex为匹配成功位置的下一个位置（如上），匹配失败，lastIndex重新设置为0。

global：从str[lastIndex]开始向后搜索匹配

sticky：从str[lastIndex]直接开始匹配，不向后搜索。

let global = /abc/g;
console.log(global.exec("xyz abc"));
// → ["abc"]
let sticky = /abc/y;
console.log(sticky.exec("xyz abc"));
// → null

所以只需简单调整一下lastIndex就可以让上面成功的失败、失败的成功：

let global = /abc/g;
global.lastIndex = 6; // 从c开始向后搜索匹配
console.log(global.exec("xyz abc"));
// → null
let sticky = /abc/y;
sticky.lastIndex = 4; // 从a开始匹配
console.log(sticky.exec("xyz abc"));
// → ["abc"]

因为在global启用时，lastIndex在匹配完之后是要自动更新的，所以，当用一个正则对象匹配多次的时候就会出现坑爹的结果：

let digit = /\d/g;
console.log(digit.exec("here it is: 1"));
// → ["1"]
console.log(digit.exec("and now: 1"));
// → null

sticky（y）模式同样会在匹配后更新lastIndex，因此也要留意同样的问题；只有g和y都不启用时才不会有这方面的顾虑。

global的另外一方面影响在于，它改变了match的行为：

console.log("Banana".match(/an/g));
// → ["an", "an"]
console.log(/an/g.exec("Banana"));
// → ["an", index: 1, input: "Banana", groups: undefined] 
// global改变了match的行为，本来上述两个
// 输出应该相同的（等价操作），而且["an", "an"]
// 后者本应该是子表达式匹配的字符串，前者的子集

总结。。慎用global

17、遍历匹配项

利用global模式下的lastIndex机制应该是最简便的方法。

let input = "A string with 3 numbers in it... 42 and 88.";
let number = /\b\d+\b/g;
let match;
while (match = number.exec(input)) {
  console.log("Found", match[0], "at", match.index);
}
// → Found 3 at 14
//   Found 42 at 33
//   Found 88 at 40

18、解析INI文件

/**
 * Parse INI-formatted text into a plain object.
 *
 * Lines of the form `key=value` become string properties. A `[name]` line
 * starts a new section object stored under `name`, and later settings go
 * into it. Blank lines and lines starting with `;` (after optional
 * whitespace) are ignored.
 *
 * @param {string} string - The INI text; lines may end in \n or \r\n.
 * @returns {Object} Top-level settings plus one nested object per section.
 * @throws {Error} If a line is not a setting, a header, a comment or blank.
 */
function parseINI(string) {
    const settingPattern = /^(\w+)=(.*)$/;
    const headerPattern = /^\[(.*)\]$/;
    const blankOrCommentPattern = /^\s*(;.*)?$/;

    const result = {};
    // Settings seen before any [section] header land on the top-level object.
    let section = result;

    for (const line of string.split(/\r?\n/)) {
        const setting = settingPattern.exec(line);
        if (setting !== null) {
            const [, key, value] = setting;
            section[key] = value;
            continue;
        }

        const header = headerPattern.exec(line);
        if (header !== null) {
            section = {};
            result[header[1]] = section;
            continue;
        }

        if (!blankOrCommentPattern.test(line)) {
            throw new Error(`Line '${line}' is not valid.`);
        }
    }
    return result;
}

console.log(parseINI(`
searchengine=https://duckduckgo.com/?q=$1
spitefulness=9.7

; comments are preceded by a semicolon...
; each section concerns an individual enemy
[larry]
fullname=Larry Doe
type=kindergarten bully
website=http://www.geocities.com/CapeCanaveral/11451

[davaeorn]
fullname=Davaeorn
type=evil wizard
outputdir=/home/marijn/enemies/davaeorn`));
// → davaeorn:  { fullname: "Davaeorn", type: "evil wizard", outputdir: "/home/marijn/enemies/davaeorn" }
// larry:  { fullname: "Larry Doe", type: "kindergarten bully", website: "http://www.geocities.com/CapeCanaveral/11451" }
// searchengine: "https://duckduckgo.com/?q=$1"
// spitefulness: "9.7"

19、国际字符

// 🍎 is stored as two UTF-16 code units, so without the u flag the {3}
// quantifier applies only to the second code unit of the emoji.
console.log(/🍎{3}/.test("🍎🍎🍎"));
// → false
// Without u, the period matches one code unit, not the whole two-unit emoji.
console.log(/<.>/.test("<🌹>"));
// → false
// With u, the pattern is matched per code point, so the period covers 🌹.
console.log(/<.>/u.test("<🌹>"));
// → true

🍎可以视为两个字符（两个UTF-16编码单元），/🍎{3}/ 后面的量词实际只针对构成🍎的第二个编码单元，解决方法是在正则后添加u（for Unicode）。然而这可能导致原有的匹配出现问题。

因此，需要在添加u的前提下，继续添加\p{Property=Value}：

console.log(/\p{Script=Greek}/u.test("α"));
// → true
console.log(/\p{Script=Arabic}/u.test("α"));
// → false
console.log(/\p{Alphabetic}/u.test("α"));
// → true
console.log(/\p{Alphabetic}/u.test("!"));
// → false

Exercises

① Regexp golf

// Fill in the regular expressions

verify(/ca[rt]/,
       ["my car", "bad cats"],
       ["camper", "high art"]);

verify(/pr?op/,
       ["pop culture", "mad props"],
       ["plop", "prrrop"]);

verify(/ferr(et|y|ari)/,
       ["ferret", "ferry", "ferrari"],
       ["ferrum", "transfer A"]);

verify(/ious\b/,
       ["how delicious", "spacious room"],
       ["ruinous", "consciousness"]);

verify(/\s[.,:;]/,
       ["bad punctuation ."],
       ["escape the period"]);

verify(/\w{7}/,
       ["hottentottententen"],
       ["no", "hotten totten tenten"]);

verify(/\b[^\We]+\b/i,
       ["red platypus", "wobbling nest"],
       ["earth bed", "learning ape", "BEET"]);


/**
 * Check an exercise regexp against sample strings and log every mismatch.
 *
 * @param {RegExp} regexp - Pattern under test.
 * @param {Iterable<string>} yes - Strings the pattern is expected to match.
 * @param {Iterable<string>} no - Strings the pattern is expected to reject.
 * @returns {undefined} Nothing; problems are reported through console.log.
 */
function verify(regexp, yes, no) {
  // A pattern still written as /.../ is an unfinished exercise; skip it.
  if (regexp.source === "...") {
    return;
  }

  for (const shouldMatch of yes) {
    if (regexp.test(shouldMatch)) {
      continue;
    }
    console.log(`Failure to match '${shouldMatch}'`);
  }

  for (const shouldNotMatch of no) {
    if (!regexp.test(shouldNotMatch)) {
      continue;
    }
    console.log(`Unexpected match for '${shouldNotMatch}'`);
  }
}

-—————— -- -——-—— -- - -----————------------ -- -- -- - -- —

② Quoting style

let text = "'I'm the cook,' he said, 'it's my job.'";
// Change this call.
console.log(text.replace(/'|([\w]'[\w])/g, str => str == "'" ? '"' : str));
// → "I'm the cook," he said, "it's my job."

课本解答：

let text = "'I'm the cook,' he said, 'it's my job.'";

console.log(text.replace(/(^|\W)'|'(\W|$)/g, '$1"$2'));
// → "I'm the cook," he said, "it's my job."

-—————— -- -——-—— -- - -----————------------ -- -- -- - -- —

③ Numbers again

// Fill in this regular expression.
let number = /^[+-]?(\d+\.?\d*|\d*\.?\d+)([eE][+-]?\d+)?$/;

// Tests:
for (let str of ["1", "-1", "+15", "1.55", ".5", "5.",
                 "1.3e2", "1E-4", "1e+12"]) {
  if (!number.test(str)) {
    console.log(`Failed to match '${str}'`);
  }
}
for (let str of ["1a", "+-1", "1.2.3", "1+1", "1e4.5",
                 ".5.", "1f5", "."]) {
  if (number.test(str)) {
    console.log(`Incorrectly accepted '${str}'`);
  }
}

课本答案（-号最好转义？）：

let number = /^[+\-]?(\d+(\.\d*)?|\.\d+)([eE][+\-]?\d+)?$/;

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。

上一篇：JavaScript笔记 #05# 用Regex辅助生成文章目录

下一篇：Eloquent JavaScript #08# Bugs and Errors

提问和评论都可以，用心的回复会被更多人看到评论

发布评论

相关文章

官方博客	全部文章	热门标签	班级博客
了解我们	网站地图	意见反馈

鸿蒙开发者社区	51CTO学堂
51CTO	软考资讯