1 |
efrain |
1 |
YUI.add('text-wordbreak', function (Y, NAME) {

/**
 * Provides utility methods for splitting strings on word breaks and determining
 * whether a character index represents a word boundary.
 *
 * @module text
 * @submodule text-wordbreak
 */

/**
 * <p>
 * Provides utility methods for splitting strings on word breaks and determining
 * whether a character index represents a word boundary, using the generic word
 * breaking algorithm defined in the Unicode Text Segmentation guidelines
 * (<a href="http://unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard
 * Annex #29</a>).
 * </p>
 *
 * <p>
 * This algorithm provides a reasonable default for many languages. However, it
 * does not cover language or context specific requirements, and it does not
 * provide meaningful results at all for languages that don't use spaces between
 * words, such as Chinese, Japanese, Thai, Lao, Khmer, and others. Server-based
 * word breaking services usually provide significantly better results with
 * better performance.
 * </p>
 *
 * @class Text.WordBreak
 * @static
 */

var Text   = Y.Text,
    WBData = Text.Data.WordBreak,

    // Constants representing code point classifications.
    ALETTER      = 0,
    MIDNUMLET    = 1,
    MIDLETTER    = 2,
    MIDNUM       = 3,
    NUMERIC      = 4,
    CR           = 5,
    LF           = 6,
    NEWLINE      = 7,
    EXTEND       = 8,
    FORMAT       = 9,
    KATAKANA     = 10,
    EXTENDNUMLET = 11,
    OTHER        = 12,

    // RegExp objects generated from code point data. Each regex matches a single
    // character against a set of Unicode code points. The index of each item in
    // this array must match its corresponding code point constant value defined
    // above. OTHER deliberately has no entry: it is the fallback classification
    // for characters that match none of these sets.
    SETS = [
        new RegExp(WBData.aletter),
        new RegExp(WBData.midnumlet),
        new RegExp(WBData.midletter),
        new RegExp(WBData.midnum),
        new RegExp(WBData.numeric),
        new RegExp(WBData.cr),
        new RegExp(WBData.lf),
        new RegExp(WBData.newline),
        new RegExp(WBData.extend),
        new RegExp(WBData.format),
        new RegExp(WBData.katakana),
        new RegExp(WBData.extendnumlet)
    ],

    EMPTY_STRING = '',
    PUNCTUATION  = new RegExp('^' + WBData.punctuation + '$'),
    WHITESPACE   = /\s/,

WordBreak = {
    // -- Public Static Methods ------------------------------------------------

    /**
     * Splits the specified string into an array of individual words.
     *
     * @method getWords
     * @param {String} string String to split.
     * @param {Object} options (optional) Options object containing zero or more
     *   of the following properties:
     *
     * <dl>
     *   <dt>ignoreCase (Boolean)</dt>
     *   <dd>
     *     If <code>true</code>, the string will be converted to lowercase
     *     before being split. Default is <code>false</code>.
     *   </dd>
     *
     *   <dt>includePunctuation (Boolean)</dt>
     *   <dd>
     *     If <code>true</code>, the returned array will include punctuation
     *     characters. Default is <code>false</code>.
     *   </dd>
     *
     *   <dt>includeWhitespace (Boolean)</dt>
     *   <dd>
     *     If <code>true</code>, the returned array will include whitespace
     *     characters. Default is <code>false</code>.
     *   </dd>
     * </dl>
     * @return {Array} Array of words.
     * @static
     */
    getWords: function (string, options) {
        var i     = 0,
            word  = [],
            words = [],
            chr,
            includePunctuation,
            includeWhitespace,
            len,
            map;

        if (!options) {
            options = {};
        }

        // Lowercase BEFORE classifying. Some lowercase mappings change the
        // string's length (e.g. U+0130 becomes "i" followed by U+0307), so a
        // map built from the original string would no longer line up with the
        // characters read from the lowercased string in the loop below.
        if (options.ignoreCase) {
            string = string.toLowerCase();
        }

        map = WordBreak._classify(string);
        len = map.length;

        includePunctuation = options.includePunctuation;
        includeWhitespace  = options.includeWhitespace;

        // Loop through each character in the classification map and determine
        // whether it precedes a word boundary, building an array of distinct
        // words as we go.
        for (; i < len; ++i) {
            chr = string.charAt(i);

            // Append this character to the current word.
            word.push(chr);

            // If there's a word boundary between the current character and the
            // next character, append the current word to the words array and
            // start building a new word.
            if (WordBreak._isWordBoundary(map, i)) {
                word = word.join(EMPTY_STRING);

                // Whitespace-only and punctuation-only "words" are dropped
                // unless the caller explicitly asked for them.
                if (word &&
                        (includeWhitespace  || !WHITESPACE.test(word)) &&
                        (includePunctuation || !PUNCTUATION.test(word))) {
                    words.push(word);
                }

                word = [];
            }
        }

        return words;
    },

    /**
     * Returns an array containing only unique words from the specified string.
     * For example, the string <code>'foo bar baz foo'</code> would result in
     * the array <code>['foo', 'bar', 'baz']</code>.
     *
     * @method getUniqueWords
     * @param {String} string String to split.
     * @param {Object} options (optional) Options (see <code>getWords()</code>
     *   for details).
     * @return {Array} Array of unique words.
     * @static
     */
    getUniqueWords: function (string, options) {
        return Y.Array.unique(WordBreak.getWords(string, options));
    },

    /**
     * <p>
     * Returns <code>true</code> if there is a word boundary between the
     * specified character index and the next character index (or the end of the
     * string).
     * </p>
     *
     * <p>
     * Note that there are always word breaks at the beginning and end of a
     * string, so <code>isWordBoundary('', 0)</code> and
     * <code>isWordBoundary('a', 0)</code> will both return <code>true</code>.
     * </p>
     *
     * @method isWordBoundary
     * @param {String} string String to test.
     * @param {Number} index Character index to test within the string.
     * @return {Boolean} <code>true</code> for a word boundary,
     *   <code>false</code> otherwise.
     * @static
     */
    isWordBoundary: function (string, index) {
        return WordBreak._isWordBoundary(WordBreak._classify(string), index);
    },

    // -- Protected Static Methods ---------------------------------------------

    /**
     * Returns a character classification map for the specified string. The map
     * holds one entry per <code>charAt()</code> position: the index of the
     * first regex in <code>SETS</code> that matches the character, or
     * <code>OTHER</code> when none matches.
     *
     * @method _classify
     * @param {String} string String to classify.
     * @return {Array} Classification map.
     * @protected
     * @static
     */
    _classify: function (string) {
        var chr,
            map          = [],
            i            = 0,
            j,
            set,
            stringLength = string.length,
            setsLength   = SETS.length,
            type;

        for (; i < stringLength; ++i) {
            chr  = string.charAt(i);
            type = OTHER;

            // First matching set wins; its index is the classification.
            for (j = 0; j < setsLength; ++j) {
                set = SETS[j];

                if (set && set.test(chr)) {
                    type = j;
                    break;
                }
            }

            map.push(type);
        }

        return map;
    },

    /**
     * <p>
     * Returns <code>true</code> if there is a word boundary between the
     * specified character index and the next character index (or the end of the
     * string).
     * </p>
     *
     * <p>
     * Note that there are always word breaks at the beginning and end of a
     * string, so <code>_isWordBoundary('', 0)</code> and
     * <code>_isWordBoundary('a', 0)</code> will both return <code>true</code>.
     * </p>
     *
     * @method _isWordBoundary
     * @param {Array} map Character classification map generated by
     *   <code>_classify</code>.
     * @param {Number} index Character index to test.
     * @return {Boolean} <code>true</code> for a word boundary,
     *   <code>false</code> otherwise (including when <code>index</code> is
     *   outside the map).
     * @protected
     * @static
     */
    _isWordBoundary: function (map, index) {
        var prevType,
            type     = map[index],
            nextType = map[index + 1],
            nextNextType;

        // Out-of-range indexes are never boundaries. Index 0 of an empty map is
        // let through so that the empty string still reports a boundary.
        if (index < 0 || (index > map.length - 1 && index !== 0)) {
            return false;
        }

        // WB2. Always break at the end of the text. This has to come before
        // the rules below: otherwise the Extend/Format rule would report "no
        // boundary" after a trailing combining mark or format character, and
        // getWords() would silently drop the final word.
        if (index >= map.length - 1) {
            return true;
        }

        // WB5. Don't break between most letters.
        if (type === ALETTER && nextType === ALETTER) {
            return false;
        }

        nextNextType = map[index + 2];

        // WB6. Don't break letters across certain punctuation.
        if (type === ALETTER &&
                (nextType === MIDLETTER || nextType === MIDNUMLET) &&
                nextNextType === ALETTER) {
            return false;
        }

        prevType = map[index - 1];

        // WB7. Don't break letters across certain punctuation.
        if ((type === MIDLETTER || type === MIDNUMLET) &&
                nextType === ALETTER &&
                prevType === ALETTER) {
            return false;
        }

        // WB8/WB9/WB10. Don't break inside sequences of digits or digits
        // adjacent to letters.
        if ((type === NUMERIC || type === ALETTER) &&
                (nextType === NUMERIC || nextType === ALETTER)) {
            return false;
        }

        // WB11. Don't break inside numeric sequences like "3.2" or
        // "3,456.789".
        if ((type === MIDNUM || type === MIDNUMLET) &&
                nextType === NUMERIC &&
                prevType === NUMERIC) {
            return false;
        }

        // WB12. Don't break inside numeric sequences like "3.2" or
        // "3,456.789".
        if (type === NUMERIC &&
                (nextType === MIDNUM || nextType === MIDNUMLET) &&
                nextNextType === NUMERIC) {
            return false;
        }

        // WB4. Ignore format and extend characters. This is approximated by
        // never reporting a boundary directly next to such a character.
        if (type === EXTEND || type === FORMAT ||
                prevType === EXTEND || prevType === FORMAT ||
                nextType === EXTEND || nextType === FORMAT) {
            return false;
        }

        // WB3. Don't break inside CRLF.
        if (type === CR && nextType === LF) {
            return false;
        }

        // WB3a. Break after newlines (including CR and LF).
        if (type === NEWLINE || type === CR || type === LF) {
            return true;
        }

        // WB3b. Break before newlines (including CR and LF).
        if (nextType === NEWLINE || nextType === CR || nextType === LF) {
            return true;
        }

        // WB13. Don't break between Katakana characters.
        if (type === KATAKANA && nextType === KATAKANA) {
            return false;
        }

        // WB13a. Don't break from extenders.
        if (nextType === EXTENDNUMLET &&
                (type === ALETTER || type === NUMERIC || type === KATAKANA ||
                type === EXTENDNUMLET)) {
            return false;
        }

        // WB13b. Don't break from extenders.
        if (type === EXTENDNUMLET &&
                (nextType === ALETTER || nextType === NUMERIC ||
                nextType === KATAKANA)) {
            return false;
        }

        // Break after any character not covered by the rules above.
        return true;
    }
};

Text.WordBreak = WordBreak;


}, '3.18.1', {"requires": ["array-extras", "text-data-wordbreak"]});
|