Files
lodash/_unicodeWords.js
2017-01-06 18:02:01 -08:00

70 lines
3.0 KiB
JavaScript

/**
 * Building blocks for `reUnicodeWord`.
 *
 * Naming convention: identifiers prefixed with `rs` hold regex *source
 * strings* that are concatenated/interpolated into larger patterns; the `re`
 * prefix is reserved for the compiled `RegExp` itself.
 */

/** Used to compose unicode character classes. */
const rsAstralRange = '\\ud800-\\udfff';
const rsComboMarksRange = '\\u0300-\\u036f';
const rsComboHalfMarksRange = '\\ufe20-\\ufe2f';
const rsComboSymbolsRange = '\\u20d0-\\u20ff';
const rsComboRange = rsComboMarksRange + rsComboHalfMarksRange + rsComboSymbolsRange;
const rsDingbatRange = '\\u2700-\\u27bf';
const rsLowerRange = 'a-z\\xdf-\\xf6\\xf8-\\xff';
const rsMathOpRange = '\\xac\\xb1\\xd7\\xf7';
const rsNonCharRange = '\\x00-\\x2f\\x3a-\\x40\\x5b-\\x60\\x7b-\\xbf';
const rsPunctuationRange = '\\u2000-\\u206f';
const rsSpaceRange = ' \\t\\x0b\\f\\xa0\\ufeff\\n\\r\\u2028\\u2029\\u1680\\u180e\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2007\\u2008\\u2009\\u200a\\u202f\\u205f\\u3000';
const rsUpperRange = 'A-Z\\xc0-\\xd6\\xd8-\\xde';
const rsVarRange = '\\ufe0e\\ufe0f';
// Every character that separates words: math operators, ASCII/Latin-1
// non-alphanumerics, general punctuation and whitespace.
const rsBreakRange = rsMathOpRange + rsNonCharRange + rsPunctuationRange + rsSpaceRange;

/** Used to compose unicode capture groups. */
const rsApos = "['\u2019]";
const rsBreak = `[${ rsBreakRange }]`;
const rsCombo = `[${ rsComboRange }]`;
const rsDigits = '\\d+';
const rsDingbat = `[${ rsDingbatRange }]`;
const rsLower = `[${ rsLowerRange }]`;
// Negated class: anything that is not astral, a break, a digit, a dingbat or a
// cased letter. Inside the class `rsDigits` contributes `\d` plus a literal
// `+`; the `+` is redundant because `rsNonCharRange` (\x00-\x2f) already
// contains it.
const rsMisc = `[^${ rsAstralRange }${ rsBreakRange + rsDigits + rsDingbatRange + rsLowerRange + rsUpperRange }]`;
// Surrogate-pair encoding of the code points U+1F3FB-U+1F3FF.
const rsFitz = '\\ud83c[\\udffb-\\udfff]';
const rsModifier = `(?:${ rsCombo }|${ rsFitz })`;
const rsNonAstral = `[^${ rsAstralRange }]`;
// Two consecutive surrogate pairs in the range U+1F1E6-U+1F1FF.
const rsRegional = '(?:\\ud83c[\\udde6-\\uddff]){2}';
const rsSurrPair = '[\\ud800-\\udbff][\\udc00-\\udfff]';
const rsUpper = `[${ rsUpperRange }]`;
const rsZWJ = '\\u200d';

/** Used to compose unicode regexes. */
const rsMiscLower = `(?:${ rsLower }|${ rsMisc })`;
const rsMiscUpper = `(?:${ rsUpper }|${ rsMisc })`;
const rsOptContrLower = `(?:${ rsApos }(?:d|ll|m|re|s|t|ve))?`;
const rsOptContrUpper = `(?:${ rsApos }(?:D|LL|M|RE|S|T|VE))?`;
const rsOptMod = `${ rsModifier }?`;
const rsOptVar = `[${ rsVarRange }]?`;
const rsOptJoin = `(?:${ rsZWJ }(?:${ [rsNonAstral, rsRegional, rsSurrPair].join('|') })${ rsOptVar + rsOptMod })*`;
// Ordinals such as `1st`, `22nd`, `4th`.
// - `1[123]th` accepts 11th/12th/13th, which the `(?![123])\dth` branch
//   (meant to reject "1th", "2th", "3th") would otherwise refuse.
// - The trailing lookahead ends the ordinal at a regex word boundary OR
//   before `_` / a letter of the opposite case. `\b` alone treats `_` and
//   letters as word characters, so `1st_place` and `1stPlace` would be torn
//   into `1`, `st`, ...
const rsOrdLower = '\\d*(?:1st|2nd|3rd|1[123]th|(?![123])\\dth)(?=\\b|[A-Z_])';
const rsOrdUpper = '\\d*(?:1ST|2ND|3RD|1[123]TH|(?![123])\\dTH)(?=\\b|[a-z_])';
const rsSeq = rsOptVar + rsOptMod + rsOptJoin;
const rsEmoji = `(?:${ [rsDingbat, rsRegional, rsSurrPair].join('|') })${ rsSeq }`;

/** Used to match complex or compound words. */
const reUnicodeWord = RegExp([
  // Optional capital + lowercase run (+ contraction), ending before a break,
  // an uppercase letter, or the end of input: `foo`, `Bar`, `don't`.
  `${ rsUpper }?${ rsLower }+${ rsOptContrLower }(?=${ [rsBreak, rsUpper, '$'].join('|') })`,
  // Uppercase/misc run (+ contraction), ending before a break, before a
  // capital that is followed by a lowercase/misc character, or at the end of
  // input: the `XML` in `XMLHttp`.
  `${ rsMiscUpper }+${ rsOptContrUpper }(?=${ [rsBreak, rsUpper + rsMiscLower, '$'].join('|') })`,
  // Optional capital + lowercase/misc run without a lookahead constraint.
  `${ rsUpper }?${ rsMiscLower }+${ rsOptContrLower }`,
  // Any remaining run of capitals.
  `${ rsUpper }+${ rsOptContrUpper }`,
  // Ordinals are tried before bare digits so `1st` is not split at the digit.
  rsOrdUpper,
  rsOrdLower,
  rsDigits,
  rsEmoji
].join('|'), 'g');
/**
 * Extracts the words of a Unicode `string` by matching it against
 * `reUnicodeWord`.
 *
 * @private
 * @param {string} string The string to inspect.
 * @returns {Array} Returns the words of `string`; an empty array when the
 *  pattern matches nothing.
 */
function unicodeWords(string) {
  const words = string.match(reUnicodeWord);
  // `match` yields `null` when a global pattern finds nothing; normalise that
  // to an empty array so callers always receive an array.
  if (words) {
    return words;
  }
  return [];
}

export default unicodeWords;