Autoría | Última modificación | Ver Log |
YUI.add('text-wordbreak', function (Y, NAME) {

/**
 * Provides utility methods for splitting strings on word breaks and determining
 * whether a character index represents a word boundary.
 *
 * @module text
 * @submodule text-wordbreak
 */

/**
 * <p>
 * Provides utility methods for splitting strings on word breaks and determining
 * whether a character index represents a word boundary, using the generic word
 * breaking algorithm defined in the Unicode Text Segmentation guidelines
 * (<a href="http://unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard
 * Annex #29</a>).
 * </p>
 *
 * <p>
 * This algorithm provides a reasonable default for many languages. However, it
 * does not cover language or context specific requirements, and it does not
 * provide meaningful results at all for languages that don't use spaces between
 * words, such as Chinese, Japanese, Thai, Lao, Khmer, and others. Server-based
 * word breaking services usually provide significantly better results with
 * better performance.
 * </p>
 *
 * @class Text.WordBreak
 * @static
 */

var Text = Y.Text,
    WBData = Text.Data.WordBreak,

    // Constants representing code point classifications.
    ALETTER = 0,
    MIDNUMLET = 1,
    MIDLETTER = 2,
    MIDNUM = 3,
    NUMERIC = 4,
    CR = 5,
    LF = 6,
    NEWLINE = 7,
    EXTEND = 8,
    FORMAT = 9,
    KATAKANA = 10,
    EXTENDNUMLET = 11,
    OTHER = 12,

    // RegExp objects generated from code point data. Each regex matches a
    // single character against a set of Unicode code points. The index of each
    // item in this array must match its corresponding code point constant
    // value defined above.
    SETS = [
        new RegExp(WBData.aletter),
        new RegExp(WBData.midnumlet),
        new RegExp(WBData.midletter),
        new RegExp(WBData.midnum),
        new RegExp(WBData.numeric),
        new RegExp(WBData.cr),
        new RegExp(WBData.lf),
        new RegExp(WBData.newline),
        new RegExp(WBData.extend),
        new RegExp(WBData.format),
        new RegExp(WBData.katakana),
        new RegExp(WBData.extendnumlet)
    ],

    EMPTY_STRING = '',
    PUNCTUATION = new RegExp('^' + WBData.punctuation + '$'),
    WHITESPACE = /\s/,

    WordBreak = {
        // -- Public Static Methods ------------------------------------------------

        /**
         * Splits the specified string into an array of individual words.
         *
         * @method getWords
         * @param {String} string String to split.
         * @param {Object} options (optional) Options object containing zero or more
         *   of the following properties:
         *
         * <dl>
         *   <dt>ignoreCase (Boolean)</dt>
         *   <dd>
         *     If <code>true</code>, the string will be converted to lowercase
         *     before being split. Default is <code>false</code>.
         *   </dd>
         *
         *   <dt>includePunctuation (Boolean)</dt>
         *   <dd>
         *     If <code>true</code>, the returned array will include punctuation
         *     characters. Default is <code>false</code>.
         *   </dd>
         *
         *   <dt>includeWhitespace (Boolean)</dt>
         *   <dd>
         *     If <code>true</code>, the returned array will include whitespace
         *     characters. Default is <code>false</code>.
         *   </dd>
         * </dl>
         * @return {Array} Array of words.
         * @static
         */
        getWords: function (string, options) {
            var i = 0,
                word = [],
                words = [],
                chr,
                includePunctuation,
                includeWhitespace,
                len,
                map;

            if (!options) {
                options = {};
            }

            // Lowercase BEFORE classifying: case conversion can change the
            // number of UTF-16 code units, and the classification map must
            // line up index-for-index with the string that is being split.
            if (options.ignoreCase) {
                string = string.toLowerCase();
            }

            map = WordBreak._classify(string);
            len = map.length;

            includePunctuation = options.includePunctuation;
            includeWhitespace = options.includeWhitespace;

            // Loop through each character in the classification map and determine
            // whether it precedes a word boundary, building an array of distinct
            // words as we go.
            for (; i < len; ++i) {
                chr = string.charAt(i);

                // Append this character to the current word.
                word.push(chr);

                // If there's a word boundary between the current character and the
                // next character, append the current word to the words array and
                // start building a new word.
                if (WordBreak._isWordBoundary(map, i)) {
                    word = word.join(EMPTY_STRING);

                    if (word &&
                            (includeWhitespace || !WHITESPACE.test(word)) &&
                            (includePunctuation || !PUNCTUATION.test(word))) {
                        words.push(word);
                    }

                    word = [];
                }
            }

            return words;
        },

        /**
         * Returns an array containing only unique words from the specified string.
         * For example, the string <code>'foo bar baz foo'</code> would result in
         * the array <code>['foo', 'bar', 'baz']</code>.
         *
         * @method getUniqueWords
         * @param {String} string String to split.
         * @param {Object} options (optional) Options (see <code>getWords()</code>
         *   for details).
         * @return {Array} Array of unique words.
         * @static
         */
        getUniqueWords: function (string, options) {
            return Y.Array.unique(WordBreak.getWords(string, options));
        },

        /**
         * <p>
         * Returns <code>true</code> if there is a word boundary between the
         * specified character index and the next character index (or the end of the
         * string).
         * </p>
         *
         * <p>
         * Note that there are always word breaks at the beginning and end of a
         * string, so <code>isWordBoundary('', 0)</code> and
         * <code>isWordBoundary('a', 0)</code> will both return <code>true</code>.
         * </p>
         *
         * @method isWordBoundary
         * @param {String} string String to test.
         * @param {Number} index Character index to test within the string.
         * @return {Boolean} <code>true</code> for a word boundary,
         *   <code>false</code> otherwise.
         * @static
         */
        isWordBoundary: function (string, index) {
            return WordBreak._isWordBoundary(WordBreak._classify(string), index);
        },

        // -- Protected Static Methods ---------------------------------------------

        /**
         * Returns a character classification map for the specified string. Each
         * entry is the index of the first regex in <code>SETS</code> that
         * matches the character at that position, or <code>OTHER</code> when
         * none match.
         *
         * @method _classify
         * @param {String} string String to classify.
         * @return {Array} Classification map.
         * @protected
         * @static
         */
        _classify: function (string) {
            var chr,
                map = [],
                i = 0,
                j,
                set,
                stringLength = string.length,
                setsLength = SETS.length,
                type;

            for (; i < stringLength; ++i) {
                chr = string.charAt(i);
                type = OTHER;

                for (j = 0; j < setsLength; ++j) {
                    set = SETS[j];

                    if (set && set.test(chr)) {
                        type = j;
                        break;
                    }
                }

                map.push(type);
            }

            return map;
        },

        /**
         * <p>
         * Returns <code>true</code> if there is a word boundary between the
         * specified character index and the next character index (or the end of the
         * string).
         * </p>
         *
         * <p>
         * Note that there are always word breaks at the beginning and end of a
         * string, so <code>_isWordBoundary('', 0)</code> and
         * <code>_isWordBoundary('a', 0)</code> will both return <code>true</code>.
         * </p>
         *
         * <p>
         * An out-of-range index logs a warning and returns <code>false</code>.
         * </p>
         *
         * @method _isWordBoundary
         * @param {Array} map Character classification map generated by
         *   <code>_classify</code>.
         * @param {Number} index Character index to test.
         * @return {Boolean}
         * @protected
         * @static
         */
        _isWordBoundary: function (map, index) {
            var lastIndex = map.length - 1,
                baseIndex,
                cursor,
                type,
                prevType,
                nextType,
                nextNextType;

            if (index < 0 || (index > lastIndex && index !== 0)) {
                Y.log('isWordBoundary: index out of bounds', 'warn', 'text-wordbreak');
                return false;
            }

            // WB2. Always break at the end of the text. This must be decided
            // before any "don't break" rule, otherwise a trailing Extend/Format
            // character would suppress the final boundary.
            if (index >= lastIndex) {
                return true;
            }

            type = map[index];
            nextType = map[index + 1];

            // WB3. Don't break inside CRLF.
            if (type === CR && nextType === LF) {
                return false;
            }

            // WB3a. Break after newlines (including CR and LF).
            if (isLineBreakType(type)) {
                return true;
            }

            // WB3b. Break before newlines (including CR and LF).
            if (isLineBreakType(nextType)) {
                return true;
            }

            // WB4. Never break before a format or extend character; it belongs
            // to the character that precedes it.
            if (isIgnorableType(nextType)) {
                return false;
            }

            // WB4 (continued). Format and extend characters are transparent to
            // the remaining rules: when the current character is one of them,
            // use the type of the base character it is attached to. The walk
            // stops at the start of the text or right after a newline, where
            // such characters are not attached to anything.
            baseIndex = index;

            while (baseIndex > 0 && isIgnorableType(map[baseIndex]) &&
                    !isLineBreakType(map[baseIndex - 1])) {
                baseIndex -= 1;
            }

            type = map[baseIndex];

            // Nearest non-ignorable type before the base character
            // (undefined when there is none).
            cursor = baseIndex - 1;

            while (cursor >= 0 && isIgnorableType(map[cursor])) {
                cursor -= 1;
            }

            prevType = map[cursor];

            // Nearest non-ignorable type after the next character
            // (undefined when there is none).
            cursor = index + 2;

            while (cursor <= lastIndex && isIgnorableType(map[cursor])) {
                cursor += 1;
            }

            nextNextType = map[cursor];

            // WB5. Don't break between most letters.
            if (type === ALETTER && nextType === ALETTER) {
                return false;
            }

            // WB6. Don't break letters across certain punctuation.
            if (type === ALETTER &&
                    (nextType === MIDLETTER || nextType === MIDNUMLET) &&
                    nextNextType === ALETTER) {
                return false;
            }

            // WB7. Don't break letters across certain punctuation.
            if ((type === MIDLETTER || type === MIDNUMLET) &&
                    nextType === ALETTER &&
                    prevType === ALETTER) {
                return false;
            }

            // WB8/WB9/WB10. Don't break inside sequences of digits or digits
            // adjacent to letters.
            if ((type === NUMERIC || type === ALETTER) &&
                    (nextType === NUMERIC || nextType === ALETTER)) {
                return false;
            }

            // WB11. Don't break inside numeric sequences like "3.2" or
            // "3,456.789".
            if ((type === MIDNUM || type === MIDNUMLET) &&
                    nextType === NUMERIC &&
                    prevType === NUMERIC) {
                return false;
            }

            // WB12. Don't break inside numeric sequences like "3.2" or
            // "3,456.789".
            if (type === NUMERIC &&
                    (nextType === MIDNUM || nextType === MIDNUMLET) &&
                    nextNextType === NUMERIC) {
                return false;
            }

            // WB13. Don't break between Katakana characters.
            if (type === KATAKANA && nextType === KATAKANA) {
                return false;
            }

            // WB13a. Don't break from extenders.
            if (nextType === EXTENDNUMLET &&
                    (type === ALETTER || type === NUMERIC || type === KATAKANA ||
                    type === EXTENDNUMLET)) {
                return false;
            }

            // WB13b. Don't break from extenders.
            if (type === EXTENDNUMLET &&
                    (nextType === ALETTER || nextType === NUMERIC ||
                    nextType === KATAKANA)) {
                return false;
            }

            // Break after any character not covered by the rules above.
            return true;
        }
    };

/**
 * Returns true when the classification is CR, LF or NEWLINE.
 *
 * @param {Number} type Code point classification constant.
 * @return {Boolean}
 * @private
 */
function isLineBreakType(type) {
    return type === NEWLINE || type === CR || type === LF;
}

/**
 * Returns true when the classification is EXTEND or FORMAT, the two classes
 * that rule WB4 treats as transparent.
 *
 * @param {Number} type Code point classification constant.
 * @return {Boolean}
 * @private
 */
function isIgnorableType(type) {
    return type === EXTEND || type === FORMAT;
}

Text.WordBreak = WordBreak;

}, '3.18.1', {"requires": ["array-extras", "text-data-wordbreak"]});