mirror of
https://github.com/whoisclebs/lodash.git
synced 2026-02-10 10:57:49 +00:00
Implement asciiWords and unicodeWords using String.match instead of binding RegExp.exec (#4417)
* Enable words tests * Implement asciiWords and unicodeWords using String.match instead of binding RegExp.exec
This commit is contained in:
committed by
John-David Dalton
parent
15e1557b2a
commit
e2555a43ad
@@ -45,14 +45,7 @@ const rsOrdUpper = '\\d*(?:1ST|2ND|3RD|(?![123])\\dTH)(?=\\b|[a-z_])'
|
|||||||
const rsSeq = rsOptVar + reOptMod + rsOptJoin
|
const rsSeq = rsOptVar + reOptMod + rsOptJoin
|
||||||
const rsEmoji = `(?:${[rsDingbat, rsRegional, rsSurrPair].join('|')})${rsSeq}`
|
const rsEmoji = `(?:${[rsDingbat, rsRegional, rsSurrPair].join('|')})${rsSeq}`
|
||||||
|
|
||||||
/**
|
const reUnicodeWords = RegExp([
|
||||||
* Splits a Unicode `string` into an array of its words.
|
|
||||||
*
|
|
||||||
* @private
|
|
||||||
* @param {string} The string to inspect.
|
|
||||||
* @returns {Array} Returns the words of `string`.
|
|
||||||
*/
|
|
||||||
const unicodeWords = RegExp.prototype.exec.bind(RegExp([
|
|
||||||
`${rsUpper}?${rsLower}+${rsOptContrLower}(?=${[rsBreak, rsUpper, '$'].join('|')})`,
|
`${rsUpper}?${rsLower}+${rsOptContrLower}(?=${[rsBreak, rsUpper, '$'].join('|')})`,
|
||||||
`${rsMiscUpper}+${rsOptContrUpper}(?=${[rsBreak, rsUpper + rsMiscLower, '$'].join('|')})`,
|
`${rsMiscUpper}+${rsOptContrUpper}(?=${[rsBreak, rsUpper + rsMiscLower, '$'].join('|')})`,
|
||||||
`${rsUpper}?${rsMiscLower}+${rsOptContrLower}`,
|
`${rsUpper}?${rsMiscLower}+${rsOptContrLower}`,
|
||||||
@@ -61,6 +54,17 @@ const unicodeWords = RegExp.prototype.exec.bind(RegExp([
|
|||||||
rsOrdLower,
|
rsOrdLower,
|
||||||
`${rsDigit}+`,
|
`${rsDigit}+`,
|
||||||
rsEmoji
|
rsEmoji
|
||||||
].join('|'), 'g'))
|
].join('|'), 'g')
|
||||||
|
|
||||||
|
/**
 * Splits a Unicode `string` into an array of its words.
 *
 * @private
 * @param {string} string The string to inspect.
 * @returns {Array} Returns the words of `string`; an empty array when
 *  `string` contains no words.
 */
function unicodeWords(string) {
  // `String#match` with a global pattern yields `null` when nothing matches;
  // normalize so the result is always an array, as documented.
  return string.match(reUnicodeWords) || []
}
|
||||||
|
|
||||||
export default unicodeWords
|
export default unicodeWords
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import assert from 'assert';
|
import assert from 'assert';
|
||||||
import lodashStable from 'lodash';
|
import lodashStable from 'lodash';
|
||||||
import { burredLetters, _, stubArray } from './utils.js';
|
import { burredLetters, _, stubArray } from './utils.js';
|
||||||
|
import words from '../words.js'
|
||||||
|
|
||||||
describe('words', function() {
|
describe('words', function() {
|
||||||
it('should match words containing Latin Unicode letters', function() {
|
it('should match words containing Latin Unicode letters', function() {
|
||||||
@@ -9,36 +10,36 @@ describe('words', function() {
|
|||||||
});
|
});
|
||||||
|
|
||||||
var actual = lodashStable.map(burredLetters, function(letter) {
|
var actual = lodashStable.map(burredLetters, function(letter) {
|
||||||
return _.words(letter);
|
return words(letter);
|
||||||
});
|
});
|
||||||
|
|
||||||
assert.deepStrictEqual(actual, expected);
|
assert.deepStrictEqual(actual, expected);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should support a `pattern`', function() {
|
it('should support a `pattern`', function() {
|
||||||
assert.deepStrictEqual(_.words('abcd', /ab|cd/g), ['ab', 'cd']);
|
assert.deepStrictEqual(words('abcd', /ab|cd/g), ['ab', 'cd']);
|
||||||
assert.deepStrictEqual(_.words('abcd', 'ab|cd'), ['ab']);
|
assert.deepStrictEqual(Array.from(words('abcd', 'ab|cd')), ['ab']);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should work with compound words', function() {
|
it('should work with compound words', function() {
|
||||||
assert.deepStrictEqual(_.words('12ft'), ['12', 'ft']);
|
assert.deepStrictEqual(words('12ft'), ['12', 'ft']);
|
||||||
assert.deepStrictEqual(_.words('aeiouAreVowels'), ['aeiou', 'Are', 'Vowels']);
|
assert.deepStrictEqual(words('aeiouAreVowels'), ['aeiou', 'Are', 'Vowels']);
|
||||||
assert.deepStrictEqual(_.words('enable 6h format'), ['enable', '6', 'h', 'format']);
|
assert.deepStrictEqual(words('enable 6h format'), ['enable', '6', 'h', 'format']);
|
||||||
assert.deepStrictEqual(_.words('enable 24H format'), ['enable', '24', 'H', 'format']);
|
assert.deepStrictEqual(words('enable 24H format'), ['enable', '24', 'H', 'format']);
|
||||||
assert.deepStrictEqual(_.words('isISO8601'), ['is', 'ISO', '8601']);
|
assert.deepStrictEqual(words('isISO8601'), ['is', 'ISO', '8601']);
|
||||||
assert.deepStrictEqual(_.words('LETTERSAeiouAreVowels'), ['LETTERS', 'Aeiou', 'Are', 'Vowels']);
|
assert.deepStrictEqual(words('LETTERSAeiouAreVowels'), ['LETTERS', 'Aeiou', 'Are', 'Vowels']);
|
||||||
assert.deepStrictEqual(_.words('tooLegit2Quit'), ['too', 'Legit', '2', 'Quit']);
|
assert.deepStrictEqual(words('tooLegit2Quit'), ['too', 'Legit', '2', 'Quit']);
|
||||||
assert.deepStrictEqual(_.words('walk500Miles'), ['walk', '500', 'Miles']);
|
assert.deepStrictEqual(words('walk500Miles'), ['walk', '500', 'Miles']);
|
||||||
assert.deepStrictEqual(_.words('xhr2Request'), ['xhr', '2', 'Request']);
|
assert.deepStrictEqual(words('xhr2Request'), ['xhr', '2', 'Request']);
|
||||||
assert.deepStrictEqual(_.words('XMLHttp'), ['XML', 'Http']);
|
assert.deepStrictEqual(words('XMLHttp'), ['XML', 'Http']);
|
||||||
assert.deepStrictEqual(_.words('XmlHTTP'), ['Xml', 'HTTP']);
|
assert.deepStrictEqual(words('XmlHTTP'), ['Xml', 'HTTP']);
|
||||||
assert.deepStrictEqual(_.words('XmlHttp'), ['Xml', 'Http']);
|
assert.deepStrictEqual(words('XmlHttp'), ['Xml', 'Http']);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should work with compound words containing diacritical marks', function() {
|
it('should work with compound words containing diacritical marks', function() {
|
||||||
assert.deepStrictEqual(_.words('LETTERSÆiouAreVowels'), ['LETTERS', 'Æiou', 'Are', 'Vowels']);
|
assert.deepStrictEqual(words('LETTERSÆiouAreVowels'), ['LETTERS', 'Æiou', 'Are', 'Vowels']);
|
||||||
assert.deepStrictEqual(_.words('æiouAreVowels'), ['æiou', 'Are', 'Vowels']);
|
assert.deepStrictEqual(words('æiouAreVowels'), ['æiou', 'Are', 'Vowels']);
|
||||||
assert.deepStrictEqual(_.words('æiou2Consonants'), ['æiou', '2', 'Consonants']);
|
assert.deepStrictEqual(words('æiou2Consonants'), ['æiou', '2', 'Consonants']);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should not treat contractions as separate words', function() {
|
it('should not treat contractions as separate words', function() {
|
||||||
@@ -48,7 +49,7 @@ describe('words', function() {
|
|||||||
lodashStable.times(2, function(index) {
|
lodashStable.times(2, function(index) {
|
||||||
var actual = lodashStable.map(postfixes, function(postfix) {
|
var actual = lodashStable.map(postfixes, function(postfix) {
|
||||||
var string = 'a b' + apos + postfix + ' c';
|
var string = 'a b' + apos + postfix + ' c';
|
||||||
return _.words(string[index ? 'toUpperCase' : 'toLowerCase']());
|
return words(string[index ? 'toUpperCase' : 'toLowerCase']());
|
||||||
});
|
});
|
||||||
|
|
||||||
var expected = lodashStable.map(postfixes, function(postfix) {
|
var expected = lodashStable.map(postfixes, function(postfix) {
|
||||||
@@ -71,8 +72,8 @@ describe('words', function() {
|
|||||||
return [ordinal[index ? 'toUpperCase' : 'toLowerCase']()];
|
return [ordinal[index ? 'toUpperCase' : 'toLowerCase']()];
|
||||||
});
|
});
|
||||||
|
|
||||||
var actual = lodashStable.map(expected, function(words) {
|
var actual = lodashStable.map(expected, function(expectedWords) {
|
||||||
return _.words(words[0]);
|
return words(expectedWords[0]);
|
||||||
});
|
});
|
||||||
|
|
||||||
assert.deepStrictEqual(actual, expected);
|
assert.deepStrictEqual(actual, expected);
|
||||||
@@ -82,7 +83,7 @@ describe('words', function() {
|
|||||||
it('should not treat mathematical operators as words', function() {
|
it('should not treat mathematical operators as words', function() {
|
||||||
var operators = ['\xac', '\xb1', '\xd7', '\xf7'],
|
var operators = ['\xac', '\xb1', '\xd7', '\xf7'],
|
||||||
expected = lodashStable.map(operators, stubArray),
|
expected = lodashStable.map(operators, stubArray),
|
||||||
actual = lodashStable.map(operators, _.words);
|
actual = lodashStable.map(operators, words);
|
||||||
|
|
||||||
assert.deepStrictEqual(actual, expected);
|
assert.deepStrictEqual(actual, expected);
|
||||||
});
|
});
|
||||||
@@ -95,25 +96,18 @@ describe('words', function() {
|
|||||||
];
|
];
|
||||||
|
|
||||||
var expected = lodashStable.map(marks, stubArray),
|
var expected = lodashStable.map(marks, stubArray),
|
||||||
actual = lodashStable.map(marks, _.words);
|
actual = lodashStable.map(marks, words);
|
||||||
|
|
||||||
assert.deepStrictEqual(actual, expected);
|
assert.deepStrictEqual(actual, expected);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should work as an iteratee for methods like `_.map`', function() {
|
|
||||||
var strings = lodashStable.map(['a', 'b', 'c'], Object),
|
|
||||||
actual = lodashStable.map(strings, _.words);
|
|
||||||
|
|
||||||
assert.deepStrictEqual(actual, [['a'], ['b'], ['c']]);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should prevent ReDoS', function() {
|
it('should prevent ReDoS', function() {
|
||||||
var largeWordLen = 50000,
|
var largeWordLen = 50000,
|
||||||
largeWord = 'A'.repeat(largeWordLen),
|
largeWord = 'A'.repeat(largeWordLen),
|
||||||
maxMs = 1000,
|
maxMs = 1000,
|
||||||
startTime = lodashStable.now();
|
startTime = lodashStable.now();
|
||||||
|
|
||||||
assert.deepStrictEqual(_.words(largeWord + 'ÆiouAreVowels'), [largeWord, 'Æiou', 'Are', 'Vowels']);
|
assert.deepStrictEqual(words(largeWord + 'ÆiouAreVowels'), [largeWord, 'Æiou', 'Are', 'Vowels']);
|
||||||
|
|
||||||
var endTime = lodashStable.now(),
|
var endTime = lodashStable.now(),
|
||||||
timeSpent = endTime - startTime;
|
timeSpent = endTime - startTime;
|
||||||
11
words.js
11
words.js
@@ -1,13 +1,16 @@
|
|||||||
import unicodeWords from './.internal/unicodeWords.js'
|
import unicodeWords from './.internal/unicodeWords.js'
|
||||||
|
|
||||||
const asciiWords = RegExp.prototype.exec.bind(
|
|
||||||
/[^\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\x7f]+/g
|
|
||||||
)
|
|
||||||
|
|
||||||
const hasUnicodeWord = RegExp.prototype.test.bind(
|
const hasUnicodeWord = RegExp.prototype.test.bind(
|
||||||
/[a-z][A-Z]|[A-Z]{2,}[a-z]|[0-9][a-zA-Z]|[a-zA-Z][0-9]|[^a-zA-Z0-9 ]/
|
/[a-z][A-Z]|[A-Z]{2,}[a-z]|[0-9][a-zA-Z]|[a-zA-Z][0-9]|[^a-zA-Z0-9 ]/
|
||||||
)
|
)
|
||||||
|
|
||||||
|
/** Used to match words composed of alphanumeric characters. */
const reAsciiWord = /[^\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\x7f]+/g

/**
 * Splits an ASCII `string` into an array of its words.
 *
 * @private
 * @param {string} string The string to inspect.
 * @returns {Array} Returns the words of `string`; an empty array when
 *  `string` contains no words.
 */
function asciiWords(string) {
  // `String#match` restarts a global pattern from index 0 on every call, so
  // the shared regexp carries no state between invocations. It returns `null`
  // when nothing matches, which is normalized to an empty array here.
  return string.match(reAsciiWord) || []
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Splits `string` into an array of its words.
|
* Splits `string` into an array of its words.
|
||||||
*
|
*
|
||||||
|
|||||||
Reference in New Issue
Block a user