Proyectos de Subversion Moodle

Rev

| Última modificación | Ver Log |

Rev Autor Línea Nro. Línea
1 efrain 1
YUI.add('text-wordbreak', function (Y, NAME) {
2
 
3
/**
4
 * Provides utility methods for splitting strings on word breaks and determining
5
 * whether a character index represents a word boundary.
6
 *
7
 * @module text
8
 * @submodule text-wordbreak
9
 */
10
 
11
/**
12
 * <p>
13
 * Provides utility methods for splitting strings on word breaks and determining
14
 * whether a character index represents a word boundary, using the generic word
15
 * breaking algorithm defined in the Unicode Text Segmentation guidelines
16
 * (<a href="http://unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard
17
 * Annex #29</a>).
18
 * </p>
19
 *
20
 * <p>
21
 * This algorithm provides a reasonable default for many languages. However, it
22
 * does not cover language or context specific requirements, and it does not
23
 * provide meaningful results at all for languages that don't use spaces between
24
 * words, such as Chinese, Japanese, Thai, Lao, Khmer, and others. Server-based
25
 * word breaking services usually provide significantly better results with
26
 * better performance.
27
 * </p>
28
 *
29
 * @class Text.WordBreak
30
 * @static
31
 */
32
 
33
var Text   = Y.Text,
34
    WBData = Text.Data.WordBreak,
35
 
36
// Constants representing code point classifications.
37
ALETTER      = 0,
38
MIDNUMLET    = 1,
39
MIDLETTER    = 2,
40
MIDNUM       = 3,
41
NUMERIC      = 4,
42
CR           = 5,
43
LF           = 6,
44
NEWLINE      = 7,
45
EXTEND       = 8,
46
FORMAT       = 9,
47
KATAKANA     = 10,
48
EXTENDNUMLET = 11,
49
OTHER        = 12,
50
 
51
// RegExp objects generated from code point data. Each regex matches a single
52
// character against a set of Unicode code points. The index of each item in
53
// this array must match its corresponding code point constant value defined
54
// above.
55
SETS = [
56
    new RegExp(WBData.aletter),
57
    new RegExp(WBData.midnumlet),
58
    new RegExp(WBData.midletter),
59
    new RegExp(WBData.midnum),
60
    new RegExp(WBData.numeric),
61
    new RegExp(WBData.cr),
62
    new RegExp(WBData.lf),
63
    new RegExp(WBData.newline),
64
    new RegExp(WBData.extend),
65
    new RegExp(WBData.format),
66
    new RegExp(WBData.katakana),
67
    new RegExp(WBData.extendnumlet)
68
],
69
 
70
EMPTY_STRING = '',
71
PUNCTUATION  = new RegExp('^' + WBData.punctuation + '$'),
72
WHITESPACE   = /\s/,
73
 
74
WordBreak = {
75
    // -- Public Static Methods ------------------------------------------------
76
 
77
    /**
78
     * Splits the specified string into an array of individual words.
79
     *
80
     * @method getWords
81
     * @param {String} string String to split.
82
     * @param {Object} options (optional) Options object containing zero or more
83
     *   of the following properties:
84
     *
85
     * <dl>
86
     *   <dt>ignoreCase (Boolean)</dt>
87
     *   <dd>
88
     *     If <code>true</code>, the string will be converted to lowercase
89
     *     before being split. Default is <code>false</code>.
90
     *   </dd>
91
     *
92
     *   <dt>includePunctuation (Boolean)</dt>
93
     *   <dd>
94
     *     If <code>true</code>, the returned array will include punctuation
95
     *     characters. Default is <code>false</code>.
96
     *   </dd>
97
     *
98
     *   <dt>includeWhitespace (Boolean)</dt>
99
     *   <dd>
100
     *     If <code>true</code>, the returned array will include whitespace
101
     *     characters. Default is <code>false</code>.
102
     *   </dd>
103
     * </dl>
104
     * @return {Array} Array of words.
105
     * @static
106
     */
107
    getWords: function (string, options) {
108
        var i     = 0,
109
            map   = WordBreak._classify(string),
110
            len   = map.length,
111
            word  = [],
112
            words = [],
113
            chr,
114
            includePunctuation,
115
            includeWhitespace;
116
 
117
        if (!options) {
118
            options = {};
119
        }
120
 
121
        if (options.ignoreCase) {
122
            string = string.toLowerCase();
123
        }
124
 
125
        includePunctuation = options.includePunctuation;
126
        includeWhitespace  = options.includeWhitespace;
127
 
128
        // Loop through each character in the classification map and determine
129
        // whether it precedes a word boundary, building an array of distinct
130
        // words as we go.
131
        for (; i < len; ++i) {
132
            chr = string.charAt(i);
133
 
134
            // Append this character to the current word.
135
            word.push(chr);
136
 
137
            // If there's a word boundary between the current character and the
138
            // next character, append the current word to the words array and
139
            // start building a new word.
140
            if (WordBreak._isWordBoundary(map, i)) {
141
                word = word.join(EMPTY_STRING);
142
 
143
                if (word &&
144
                        (includeWhitespace  || !WHITESPACE.test(word)) &&
145
                        (includePunctuation || !PUNCTUATION.test(word))) {
146
                    words.push(word);
147
                }
148
 
149
                word = [];
150
            }
151
        }
152
 
153
        return words;
154
    },
155
 
156
    /**
157
     * Returns an array containing only unique words from the specified string.
158
     * For example, the string <code>'foo bar baz foo'</code> would result in
159
     * the array <code>['foo', 'bar', 'baz']</code>.
160
     *
161
     * @method getUniqueWords
162
     * @param {String} string String to split.
163
     * @param {Object} options (optional) Options (see <code>getWords()</code>
164
     *   for details).
165
     * @return {Array} Array of unique words.
166
     * @static
167
     */
168
    getUniqueWords: function (string, options) {
169
        return Y.Array.unique(WordBreak.getWords(string, options));
170
    },
171
 
172
    /**
173
     * <p>
174
     * Returns <code>true</code> if there is a word boundary between the
175
     * specified character index and the next character index (or the end of the
176
     * string).
177
     * </p>
178
     *
179
     * <p>
180
     * Note that there are always word breaks at the beginning and end of a
181
     * string, so <code>isWordBoundary('', 0)</code> and
182
     * <code>isWordBoundary('a', 0)</code> will both return <code>true</code>.
183
     * </p>
184
     *
185
     * @method isWordBoundary
186
     * @param {String} string String to test.
187
     * @param {Number} index Character index to test within the string.
188
     * @return {Boolean} <code>true</code> for a word boundary,
189
     *   <code>false</code> otherwise.
190
     * @static
191
     */
192
    isWordBoundary: function (string, index) {
193
        return WordBreak._isWordBoundary(WordBreak._classify(string), index);
194
    },
195
 
196
    // -- Protected Static Methods ---------------------------------------------
197
 
198
    /**
199
     * Returns a character classification map for the specified string.
200
     *
201
     * @method _classify
202
     * @param {String} string String to classify.
203
     * @return {Array} Classification map.
204
     * @protected
205
     * @static
206
     */
207
    _classify: function (string) {
208
        var chr,
209
            map          = [],
210
            i            = 0,
211
            j,
212
            set,
213
            stringLength = string.length,
214
            setsLength   = SETS.length,
215
            type;
216
 
217
        for (; i < stringLength; ++i) {
218
            chr  = string.charAt(i);
219
            type = OTHER;
220
 
221
            for (j = 0; j < setsLength; ++j) {
222
                set = SETS[j];
223
 
224
                if (set && set.test(chr)) {
225
                    type = j;
226
                    break;
227
                }
228
            }
229
 
230
            map.push(type);
231
        }
232
 
233
        return map;
234
    },
235
 
236
    /**
237
     * <p>
238
     * Returns <code>true</code> if there is a word boundary between the
239
     * specified character index and the next character index (or the end of the
240
     * string).
241
     * </p>
242
     *
243
     * <p>
244
     * Note that there are always word breaks at the beginning and end of a
245
     * string, so <code>_isWordBoundary('', 0)</code> and
246
     * <code>_isWordBoundary('a', 0)</code> will both return <code>true</code>.
247
     * </p>
248
     *
249
     * @method _isWordBoundary
250
     * @param {Array} map Character classification map generated by
251
     *   <code>_classify</code>.
252
     * @param {Number} index Character index to test.
253
     * @return {Boolean}
254
     * @protected
255
     * @static
256
     */
257
    _isWordBoundary: function (map, index) {
258
        var prevType,
259
            type     = map[index],
260
            nextType = map[index + 1],
261
            nextNextType;
262
 
263
        if (index < 0 || (index > map.length - 1 && index !== 0)) {
264
            return false;
265
        }
266
 
267
        // WB5. Don't break between most letters.
268
        if (type === ALETTER && nextType === ALETTER) {
269
            return false;
270
        }
271
 
272
        nextNextType = map[index + 2];
273
 
274
        // WB6. Don't break letters across certain punctuation.
275
        if (type === ALETTER &&
276
                (nextType === MIDLETTER || nextType === MIDNUMLET) &&
277
                nextNextType === ALETTER) {
278
            return false;
279
        }
280
 
281
        prevType = map[index - 1];
282
 
283
        // WB7. Don't break letters across certain punctuation.
284
        if ((type === MIDLETTER || type === MIDNUMLET) &&
285
                nextType === ALETTER &&
286
                prevType === ALETTER) {
287
            return false;
288
        }
289
 
290
        // WB8/WB9/WB10. Don't break inside sequences of digits or digits
291
        // adjacent to letters.
292
        if ((type === NUMERIC || type === ALETTER) &&
293
                (nextType === NUMERIC || nextType === ALETTER)) {
294
            return false;
295
        }
296
 
297
        // WB11. Don't break inside numeric sequences like "3.2" or
298
        // "3,456.789".
299
        if ((type === MIDNUM || type === MIDNUMLET) &&
300
                nextType === NUMERIC &&
301
                prevType === NUMERIC) {
302
            return false;
303
        }
304
 
305
        // WB12. Don't break inside numeric sequences like "3.2" or
306
        // "3,456.789".
307
        if (type === NUMERIC &&
308
                (nextType === MIDNUM || nextType === MIDNUMLET) &&
309
                nextNextType === NUMERIC) {
310
            return false;
311
        }
312
 
313
        // WB4. Ignore format and extend characters.
314
        if (type === EXTEND || type === FORMAT ||
315
                prevType === EXTEND || prevType === FORMAT ||
316
                nextType === EXTEND || nextType === FORMAT) {
317
            return false;
318
        }
319
 
320
        // WB3. Don't break inside CRLF.
321
        if (type === CR && nextType === LF) {
322
            return false;
323
        }
324
 
325
        // WB3a. Break before newlines (including CR and LF).
326
        if (type === NEWLINE || type === CR || type === LF) {
327
            return true;
328
        }
329
 
330
        // WB3b. Break after newlines (including CR and LF).
331
        if (nextType === NEWLINE || nextType === CR || nextType === LF) {
332
            return true;
333
        }
334
 
335
        // WB13. Don't break between Katakana characters.
336
        if (type === KATAKANA && nextType === KATAKANA) {
337
            return false;
338
        }
339
 
340
        // WB13a. Don't break from extenders.
341
        if (nextType === EXTENDNUMLET &&
342
                (type === ALETTER || type === NUMERIC || type === KATAKANA ||
343
                type === EXTENDNUMLET)) {
344
            return false;
345
        }
346
 
347
        // WB13b. Don't break from extenders.
348
        if (type === EXTENDNUMLET &&
349
                (nextType === ALETTER || nextType === NUMERIC ||
350
                nextType === KATAKANA)) {
351
            return false;
352
        }
353
 
354
        // Break after any character not covered by the rules above.
355
        return true;
356
    }
357
};
358
 
359
// Publish the utility object on the Y.Text namespace as Y.Text.WordBreak.
Text.WordBreak = WordBreak;
360
 
361
 
362
}, '3.18.1', {"requires": ["array-extras", "text-data-wordbreak"]});