/*
 * Moodle Subversion projects
 *
 * Rev | Last modified | View log
 *
 * Rev  Author  Line no.  Line
 * 1    efrain  1
 */
YUI.add('text-wordbreak', function (Y, NAME) {

/**
 * Provides utility methods for splitting strings on word breaks and determining
 * whether a character index represents a word boundary.
 *
 * @module text
 * @submodule text-wordbreak
 */

/**
 * <p>
 * Provides utility methods for splitting strings on word breaks and determining
 * whether a character index represents a word boundary, using the generic word
 * breaking algorithm defined in the Unicode Text Segmentation guidelines
 * (<a href="http://unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard
 * Annex #29</a>).
 * </p>
 *
 * <p>
 * This algorithm provides a reasonable default for many languages. However, it
 * does not cover language or context specific requirements, and it does not
 * provide meaningful results at all for languages that don't use spaces between
 * words, such as Chinese, Japanese, Thai, Lao, Khmer, and others. Server-based
 * word breaking services usually provide significantly better results with
 * better performance.
 * </p>
 *
 * @class Text.WordBreak
 * @static
 */

var Text   = Y.Text,
    WBData = Text.Data.WordBreak,

// Constants representing code point classifications. The values double as
// indices into SETS below, so the two lists must stay in the same order.
ALETTER      = 0,
MIDNUMLET    = 1,
MIDLETTER    = 2,
MIDNUM       = 3,
NUMERIC      = 4,
CR           = 5,
LF           = 6,
NEWLINE      = 7,
EXTEND       = 8,
FORMAT       = 9,
KATAKANA     = 10,
EXTENDNUMLET = 11,
OTHER        = 12,

// RegExp objects generated from code point data. Each regex matches a single
// character against a set of Unicode code points. The index of each item in
// this array must match its corresponding code point constant value defined
// above. OTHER has no entry: it is the fallback used when no set matches.
SETS = [
    new RegExp(WBData.aletter),
    new RegExp(WBData.midnumlet),
    new RegExp(WBData.midletter),
    new RegExp(WBData.midnum),
    new RegExp(WBData.numeric),
    new RegExp(WBData.cr),
    new RegExp(WBData.lf),
    new RegExp(WBData.newline),
    new RegExp(WBData.extend),
    new RegExp(WBData.format),
    new RegExp(WBData.katakana),
    new RegExp(WBData.extendnumlet)
],

EMPTY_STRING = '',

// Anchored at both ends, so it only matches a word that consists entirely of
// whatever WBData.punctuation matches.
PUNCTUATION  = new RegExp('^' + WBData.punctuation + '$'),

// Unanchored: matches any word that contains at least one whitespace character.
WHITESPACE   = /\s/,

WordBreak = {
    // -- Public Static Methods ------------------------------------------------

    /**
     * Splits the specified string into an array of individual words.
     *
     * @method getWords
     * @param {String} string String to split.
     * @param {Object} options (optional) Options object containing zero or more
     *   of the following properties:
     *
     * <dl>
     *   <dt>ignoreCase (Boolean)</dt>
     *   <dd>
     *     If <code>true</code>, the string will be converted to lowercase
     *     before being split. Default is <code>false</code>.
     *   </dd>
     *
     *   <dt>includePunctuation (Boolean)</dt>
     *   <dd>
     *     If <code>true</code>, the returned array will include punctuation
     *     characters. Default is <code>false</code>.
     *   </dd>
     *
     *   <dt>includeWhitespace (Boolean)</dt>
     *   <dd>
     *     If <code>true</code>, the returned array will include whitespace
     *     characters. Default is <code>false</code>.
     *   </dd>
     * </dl>
     * @return {Array} Array of words.
     * @static
     */
    getWords: function (string, options) {
        var i     = 0,
            word  = [],
            words = [],
            includePunctuation,
            includeWhitespace,
            len,
            map;

        // Joins the collected characters and keeps the result unless it is
        // empty or filtered out by the whitespace/punctuation options.
        function commit(chars) {
            var text = chars.join(EMPTY_STRING);

            if (text &&
                    (includeWhitespace  || !WHITESPACE.test(text)) &&
                    (includePunctuation || !PUNCTUATION.test(text))) {
                words.push(text);
            }
        }

        if (!options) {
            options = {};
        }

        if (options.ignoreCase) {
            string = string.toLowerCase();
        }

        // Classify only after case folding. toLowerCase() can change the
        // length of a string (U+0130 becomes two code units, for example), so
        // a map built from the original string could fall out of step with
        // the characters read in the loop below.
        map = WordBreak._classify(string);
        len = map.length;

        includePunctuation = options.includePunctuation;
        includeWhitespace  = options.includeWhitespace;

        // Loop through each character in the classification map and determine
        // whether it precedes a word boundary, building an array of distinct
        // words as we go.
        for (; i < len; ++i) {
            // Append this character to the current word.
            word.push(string.charAt(i));

            // If there's a word boundary between the current character and the
            // next character, append the current word to the words array and
            // start building a new word.
            if (WordBreak._isWordBoundary(map, i)) {
                commit(word);
                word = [];
            }
        }

        // Safety net: never silently discard characters that were collected
        // but not followed by a reported boundary.
        if (word.length) {
            commit(word);
        }

        return words;
    },

    /**
     * Returns an array containing only unique words from the specified string.
     * For example, the string <code>'foo bar baz foo'</code> would result in
     * the array <code>['foo', 'bar', 'baz']</code>.
     *
     * @method getUniqueWords
     * @param {String} string String to split.
     * @param {Object} options (optional) Options (see <code>getWords()</code>
     *   for details).
     * @return {Array} Array of unique words.
     * @static
     */
    getUniqueWords: function (string, options) {
        return Y.Array.unique(WordBreak.getWords(string, options));
    },

    /**
     * <p>
     * Returns <code>true</code> if there is a word boundary between the
     * specified character index and the next character index (or the end of the
     * string).
     * </p>
     *
     * <p>
     * Note that there are always word breaks at the beginning and end of a
     * string, so <code>isWordBoundary('', 0)</code> and
     * <code>isWordBoundary('a', 0)</code> will both return <code>true</code>.
     * </p>
     *
     * <p>
     * The whole string is classified on every call; callers testing many
     * indices of the same string pay that cost each time.
     * </p>
     *
     * @method isWordBoundary
     * @param {String} string String to test.
     * @param {Number} index Character index to test within the string.
     * @return {Boolean} <code>true</code> for a word boundary,
     *   <code>false</code> otherwise.
     * @static
     */
    isWordBoundary: function (string, index) {
        return WordBreak._isWordBoundary(WordBreak._classify(string), index);
    },

    // -- Protected Static Methods ---------------------------------------------

    /**
     * Returns a character classification map for the specified string: an
     * array with one entry per <code>charAt()</code> position, holding the
     * index of the first <code>SETS</code> regex that matches that character,
     * or <code>OTHER</code> when none does.
     *
     * @method _classify
     * @param {String} string String to classify.
     * @return {Array} Classification map.
     * @protected
     * @static
     */
    _classify: function (string) {
        var chr,
            map          = [],
            i            = 0,
            j,
            set,
            stringLength = string.length,
            setsLength   = SETS.length,
            type;

        for (; i < stringLength; ++i) {
            chr  = string.charAt(i);
            type = OTHER;

            // First matching set wins, so the order of SETS matters for any
            // character that belongs to more than one set.
            for (j = 0; j < setsLength; ++j) {
                set = SETS[j];

                if (set && set.test(chr)) {
                    type = j;
                    break;
                }
            }

            map.push(type);
        }

        return map;
    },

    /**
     * <p>
     * Returns <code>true</code> if there is a word boundary between the
     * specified character index and the next character index (or the end of the
     * string).
     * </p>
     *
     * <p>
     * Note that there are always word breaks at the beginning and end of a
     * string, so <code>_isWordBoundary('', 0)</code> and
     * <code>_isWordBoundary('a', 0)</code> will both return <code>true</code>.
     * </p>
     *
     * <p>
     * An out-of-range index logs a warning and returns <code>false</code>.
     * </p>
     *
     * @method _isWordBoundary
     * @param {Array} map Character classification map generated by
     *   <code>_classify</code>.
     * @param {Number} index Character index to test.
     * @return {Boolean}
     * @protected
     * @static
     */
    _isWordBoundary: function (map, index) {
        var prevType,
            type,
            nextType,
            nextNextType;

        // Index 0 is tolerated for an empty map so that the documented
        // "empty string has a boundary at 0" case is not reported as an error.
        if (index < 0 || (index > map.length - 1 && index !== 0)) {
            Y.log('isWordBoundary: index out of bounds', 'warn', 'text-wordbreak');
            return false;
        }

        // WB2. Always break at the end of the text. Without this check the
        // WB4 test below answers "no break" when the last or second-to-last
        // character is Extend/Format, and the final word is never closed.
        if (index >= map.length - 1) {
            return true;
        }

        type     = map[index];
        nextType = map[index + 1];

        // WB5. Don't break between most letters.
        if (type === ALETTER && nextType === ALETTER) {
            return false;
        }

        nextNextType = map[index + 2];

        // WB6. Don't break letters across certain punctuation.
        if (type === ALETTER &&
                (nextType === MIDLETTER || nextType === MIDNUMLET) &&
                nextNextType === ALETTER) {
            return false;
        }

        prevType = map[index - 1];

        // WB7. Don't break letters across certain punctuation.
        if ((type === MIDLETTER || type === MIDNUMLET) &&
                nextType === ALETTER &&
                prevType === ALETTER) {
            return false;
        }

        // WB8/WB9/WB10. Don't break inside sequences of digits or digits
        // adjacent to letters.
        if ((type === NUMERIC || type === ALETTER) &&
                (nextType === NUMERIC || nextType === ALETTER)) {
            return false;
        }

        // WB11. Don't break inside numeric sequences like "3.2" or
        // "3,456.789".
        if ((type === MIDNUM || type === MIDNUMLET) &&
                nextType === NUMERIC &&
                prevType === NUMERIC) {
            return false;
        }

        // WB12. Don't break inside numeric sequences like "3.2" or
        // "3,456.789".
        if (type === NUMERIC &&
                (nextType === MIDNUM || nextType === MIDNUMLET) &&
                nextNextType === NUMERIC) {
            return false;
        }

        // WB4. Ignore format and extend characters. This is a coarse
        // approximation: any Extend/Format character at the previous, current
        // or next position suppresses the break, whatever surrounds it.
        if (type === EXTEND || type === FORMAT ||
                prevType === EXTEND || prevType === FORMAT ||
                nextType === EXTEND || nextType === FORMAT) {
            return false;
        }

        // WB3. Don't break inside CRLF.
        if (type === CR && nextType === LF) {
            return false;
        }

        // WB3a. Break after newlines (including CR and LF): the current
        // character is the newline.
        if (type === NEWLINE || type === CR || type === LF) {
            return true;
        }

        // WB3b. Break before newlines (including CR and LF): the next
        // character is the newline.
        if (nextType === NEWLINE || nextType === CR || nextType === LF) {
            return true;
        }

        // WB13. Don't break between Katakana characters.
        if (type === KATAKANA && nextType === KATAKANA) {
            return false;
        }

        // WB13a. Don't break from extenders.
        if (nextType === EXTENDNUMLET &&
                (type === ALETTER || type === NUMERIC || type === KATAKANA ||
                type === EXTENDNUMLET)) {
            return false;
        }

        // WB13b. Don't break from extenders.
        if (type === EXTENDNUMLET &&
                (nextType === ALETTER || nextType === NUMERIC ||
                nextType === KATAKANA)) {
            return false;
        }

        // Break after any character not covered by the rules above.
        return true;
    }
};

Text.WordBreak = WordBreak;


}, '3.18.1', {"requires": ["array-extras", "text-data-wordbreak"]});