Make _.deburr handle combining diacritical marks. [closes #1070]

This commit is contained in:
jdalton
2015-03-23 20:50:04 -07:00
parent 763b003a11
commit 9d79cc7e87
2 changed files with 35 additions and 1 deletions

View File

@@ -87,6 +87,13 @@
reEvaluate = /<%([\s\S]+?)%>/g,
reInterpolate = /<%=([\s\S]+?)%>/g;
/**
 * Used to match combining diacritical marks: the Combining Diacritical Marks
 * block (U+0300-U+036F) and the whole Combining Half Marks block
 * (U+FE20-U+FE2F), not just its first four code points.
 * See [Wikipedia](https://en.wikipedia.org/wiki/Combining_Diacritical_Marks)
 * for more details.
 */
var reComboMarks = /[\u0300-\u036f\ufe20-\ufe2f]/g;
/**
* Used to match ES template delimiters.
* See the [ES spec](https://people.mozilla.org/~jorendorff/es6-draft.html#sec-template-literal-lexical-components)
@@ -10073,7 +10080,7 @@
*/
function deburr(string) {
  string = baseToString(string);
  // Replace the letters matched by `reLatin1` via `deburrLetter` first, then
  // strip whatever `reComboMarks` matches from the result. The `&&`
  // short-circuit hands a falsy `string` back unchanged.
  return string && string.replace(reLatin1, deburrLetter).replace(reComboMarks, '');
}
/**

View File

@@ -235,6 +235,22 @@
'\xef', '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff'
];
/**
 * Combining half marks, i.e. diacritical marks whose halves are placed on
 * neighbouring characters so that the mark spans multiple characters.
 */
var comboHalfs = [
  '\ufe20', // COMBINING LIGATURE LEFT HALF
  '\ufe21', // COMBINING LIGATURE RIGHT HALF
  '\ufe22', // COMBINING DOUBLE TILDE LEFT HALF
  '\ufe23'  // COMBINING DOUBLE TILDE RIGHT HALF
];
/**
 * List of common combining diacritical marks: one single-character string for
 * every code point from U+0300 through U+036F, in ascending order.
 */
var comboMarks = (function() {
  var marks = [];
  for (var code = 0x0300; code <= 0x036f; code++) {
    marks.push(String.fromCharCode(code));
  }
  return marks;
}());
/** List of `burredLetters` translated to basic latin letters. */
var deburredLetters = [
'A', 'A', 'A', 'A', 'A', 'A', 'Ae', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I',
@@ -3533,6 +3549,17 @@
deepEqual(actual, operators);
});
test('should deburr combining diacritical marks', 1, function() {
  // Sandwich every mark between two plain letters; once the mark is stripped
  // only 'ei' should remain.
  var marks = comboMarks.concat(comboHalfs);

  var results = _.map(marks, function(mark) {
    return _.deburr('e' + mark + 'i');
  });

  deepEqual(results, _.map(marks, _.constant('ei')));
});
}());
/*--------------------------------------------------------------------------*/