Proyectos de Subversion Moodle

Rev

| Ultima modificación | Ver Log |

Rev Autor Línea Nro. Línea
1 efrain 1
<?php
2
// This file is part of Moodle - http://moodle.org/
3
//
4
// Moodle is free software: you can redistribute it and/or modify
5
// it under the terms of the GNU General Public License as published by
6
// the Free Software Foundation, either version 3 of the License, or
7
// (at your option) any later version.
8
//
9
// Moodle is distributed in the hope that it will be useful,
10
// but WITHOUT ANY WARRANTY; without even the implied warranty of
11
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
// GNU General Public License for more details.
13
//
14
// You should have received a copy of the GNU General Public License
15
// along with Moodle.  If not, see <http://www.gnu.org/licenses/>.
16
 
17
/**
18
 * Datasets manager.
19
 *
20
 * @package   core_analytics
21
 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
22
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
23
 */
24
 
25
namespace core_analytics;
26
 
27
defined('MOODLE_INTERNAL') || die();
28
 
29
/**
30
 * Datasets manager.
31
 *
32
 * @package   core_analytics
33
 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
34
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
35
 */
36
class dataset_manager {
37
 
38
    /**
39
     * File area for labelled datasets.
40
     */
41
    const LABELLED_FILEAREA = 'labelled';
42
 
43
    /**
44
     * File area for unlabelled datasets.
45
     */
46
    const UNLABELLED_FILEAREA = 'unlabelled';
47
 
48
    /**
49
     * File area for exported datasets.
50
     */
51
    const EXPORT_FILEAREA = 'export';
52
 
53
    /**
54
     * Evaluation file name.
55
     */
56
    const EVALUATION_FILENAME = 'evaluation.csv';
57
 
58
    /**
59
     * The model id.
60
     *
61
     * @var int
62
     */
63
    protected $modelid;
64
 
65
    /**
66
     * Range processor in use.
67
     *
68
     * @var string
69
     */
70
    protected $timesplittingid;
71
 
72
    /**
73
     * @var int
74
     */
75
    protected $analysableid;
76
 
77
    /**
78
     * Whether this is a dataset for evaluation or not.
79
     *
80
     * @var bool
81
     */
82
    protected $evaluation;
83
 
84
    /**
85
     * The dataset filearea. Must be one of the self::*_FILEAREA options.
86
     *
87
     * @var string
88
     */
89
    protected $filearea;
90
 
91
    /**
     * Builds a dataset manager for a model, an analysable and a time splitting method.
     *
     * @throws \coding_exception If $filearea is not one of the self::*_FILEAREA constants.
     * @param int $modelid The model id.
     * @param int $analysableid The analysable id.
     * @param string $timesplittingid The time splitting method id.
     * @param string $filearea One of the self::*_FILEAREA constants.
     * @param bool $evaluation Whether this is a dataset for evaluation or not.
     * @return void
     */
    public function __construct($modelid, $analysableid, $timesplittingid, $filearea, $evaluation = false) {

        // Only the file areas declared by this class are accepted.
        $validfileareas = [self::EXPORT_FILEAREA, self::LABELLED_FILEAREA, self::UNLABELLED_FILEAREA];
        if (!in_array($filearea, $validfileareas, true)) {
            throw new \coding_exception('Invalid provided filearea');
        }

        $this->modelid = $modelid;
        $this->analysableid = $analysableid;
        $this->timesplittingid = $timesplittingid;
        $this->filearea = $filearea;
        $this->evaluation = $evaluation;
    }
115
 
116
    /**
     * Store the dataset in the internal file system.
     *
     * The rows are written as CSV to a temporary file inside the request directory and that
     * file is then added to the file storage. For evaluation datasets, the files already
     * stored in the same file area, item id and file path are deleted first.
     *
     * @param array $data Dataset rows; each row is an array of fields passed to fputcsv().
     * @return \stored_file|false The stored file, or false if the temporary file could not be opened.
     */
    public function store($data) {

        $fs = get_file_storage();

        $filerecord = [
            'component' => 'analytics',
            'filearea' => $this->filearea,
            'itemid' => $this->modelid,
            'contextid' => \context_system::instance()->id,
            'filepath' => '/analysable/' . $this->analysableid . '/' .
                \core_analytics\analysis::clean_time_splitting_id($this->timesplittingid) . '/',
            'filename' => self::get_filename($this->evaluation)
        ];

        // Delete previous and old (we already checked that previous copies are not recent) evaluation files for this analysable.
        if ($this->evaluation) {
            // The item id is bound as a named parameter instead of being interpolated into the SQL fragment.
            $select = ' = :modelitemid AND filepath = :filepath';
            $params = [
                'modelitemid' => $filerecord['itemid'],
                'filepath' => $filerecord['filepath'],
            ];
            $fs->delete_area_files_select($filerecord['contextid'], $filerecord['component'], $filerecord['filearea'],
                $select, $params);
        }

        // Write all this stuff to a tmp file.
        $filepath = make_request_directory() . DIRECTORY_SEPARATOR . $filerecord['filename'];
        $fh = fopen($filepath, 'w+');
        if (!$fh) {
            return false;
        }
        foreach ($data as $line) {
            fputcsv($fh, $line);
        }
        fclose($fh);

        return $fs->create_file_from_pathname($filerecord, $filepath);
    }
157
 
158
    /**
     * Returns the previous evaluation file.
     *
     * Important to note that this is per modelid + timesplittingid, when dealing with multiple
     * analysables this is the merged file. Do not confuse with self::get_evaluation_analysable_file
     *
     * @param int $modelid
     * @param string $timesplittingid
     * @return \stored_file|bool Whatever the file storage get_file() call returns for the evaluation file.
     */
    public static function get_previous_evaluation_file($modelid, $timesplittingid) {
        $storage = get_file_storage();

        // Merged files live under /timesplitting/<clean time splitting id>/.
        $cleanid = \core_analytics\analysis::clean_time_splitting_id($timesplittingid);
        $directory = '/timesplitting/' . $cleanid . '/';

        // Evaluation data is always labelled.
        return $storage->get_file(
            \context_system::instance()->id,
            'analytics',
            self::LABELLED_FILEAREA,
            $modelid,
            $directory,
            self::EVALUATION_FILENAME
        );
    }
175
 
176
    /**
     * Gets the list of files that couldn't be previously used for training and prediction.
     *
     * Labelled files are checked against the 'trained' records and unlabelled files against the
     * 'predicted' records of the analytics_used_files table. Evaluation files and directories
     * are never returned.
     *
     * @param int $modelid
     * @param bool $includetarget True to look for labelled (training) files, false for unlabelled (prediction) files.
     * @param string[] $timesplittingids
     * @return array Pending \stored_file lists indexed by time splitting id; ids without pending files have no entry.
     */
    public static function get_pending_files($modelid, $includetarget, $timesplittingids) {
        global $DB;

        $fs = get_file_storage();
        $contextid = \context_system::instance()->id;

        if ($includetarget) {
            $filearea = self::LABELLED_FILEAREA;
            $usedfileaction = 'trained';
        } else {
            $filearea = self::UNLABELLED_FILEAREA;
            $usedfileaction = 'predicted';
        }

        $select = 'modelid = :modelid AND action = :action';
        $params = array('modelid' => $modelid, 'action' => $usedfileaction);
        $usedfileids = $DB->get_fieldset_select('analytics_used_files', 'fileid', $select, $params);

        // Flip the ids into keys so that each file is checked with a constant-time lookup
        // instead of a linear scan of the whole list.
        $usedfileids = array_flip($usedfileids);

        // Very likely that we will only have 1 time splitting method here.
        $filesbytimesplitting = array();
        foreach ($timesplittingids as $timesplittingid) {

            $filepath = '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';
            $files = $fs->get_directory_files($contextid, 'analytics', $filearea, $modelid, $filepath);
            foreach ($files as $file) {

                // Discard evaluation files.
                if ($file->get_filename() === self::EVALUATION_FILENAME) {
                    continue;
                }

                // No dirs.
                if ($file->is_directory()) {
                    continue;
                }

                // Already used for training or prediction.
                if (isset($usedfileids[$file->get_id()])) {
                    continue;
                }

                $filesbytimesplitting[$timesplittingid][] = $file;
            }
        }

        return $filesbytimesplitting;
    }
230
 
231
    /**
     * Deletes the previous evaluation file of this model and time splitting method.
     *
     * @param int $modelid
     * @param string $timesplittingid
     * @return bool True if a previous evaluation file was found and deleted, false if there was none.
     */
    public static function delete_previous_evaluation_file($modelid, $timesplittingid) {
        $previousfile = self::get_previous_evaluation_file($modelid, $timesplittingid);

        // Nothing to delete.
        if (!$previousfile) {
            return false;
        }

        $previousfile->delete();
        return true;
    }
246
 
247
    /**
     * Returns this (model + analysable + time splitting) evaluation file.
     *
     * @param int $modelid
     * @param int $analysableid
     * @param string $timesplittingid
     * @return \stored_file|bool Whatever the file storage get_file() call returns for that file.
     */
    public static function get_evaluation_analysable_file($modelid, $analysableid, $timesplittingid) {

        $storage = get_file_storage();

        // Evaluation files are always stored in the labelled file area under the evaluation file name.
        $area = self::LABELLED_FILEAREA;
        $name = self::get_filename(true);

        // Per-analysable files live under /analysable/<analysable id>/<clean time splitting id>/.
        $cleanid = \core_analytics\analysis::clean_time_splitting_id($timesplittingid);
        $directory = '/analysable/' . $analysableid . '/' . $cleanid . '/';

        return $storage->get_file(\context_system::instance()->id, 'analytics', $area, $modelid, $directory, $name);
    }
267
 
268
    /**
     * Merge multiple files into one.
     *
     * The first three CSV lines of every file are treated as headers: variable names,
     * analysable values and column names. The variable names and column names written to the
     * merged file are the ones read from the last file; the analysable values of all files are
     * combined per position, with the distinct values joined by '|'. All remaining lines of each
     * file are copied verbatim.
     *
     * Important! It is the caller's responsibility to ensure that the datasets are compatible.
     *
     * @throws \moodle_exception If the temporary merge file cannot be opened for writing.
     * @param array  $files Files exposing get_content_file_handle(); expected to be non-empty.
     * @param int    $modelid
     * @param string $timesplittingid
     * @param string $filearea
     * @param bool   $evaluation
     * @return \stored_file
     */
    public static function merge_datasets(array $files, $modelid, $timesplittingid, $filearea, $evaluation = false) {

        $tmpfilepath = make_request_directory() . DIRECTORY_SEPARATOR . 'tmpfile.csv';

        // Add headers.
        // We could also do this with a single iteration gathering all files headers and appending them to the beginning of the file
        // once all file contents are merged.
        $varnames = '';
        $analysablesvalues = array();
        foreach ($files as $file) {
            $rh = $file->get_content_file_handle();

            // Copy the var names as they are, all files should have the same var names.
            $varnames = fgetcsv($rh);

            $analysablesvalues[] = fgetcsv($rh);

            // Copy the columns as they are, all files should have the same columns.
            $columns = fgetcsv($rh);

            // Only the headers are needed in this pass; release the handle instead of leaking one per file.
            fclose($rh);
        }

        // Merge analysable values skipping the ones that are the same in all analysables.
        $values = array();
        foreach ($analysablesvalues as $analysablevalues) {
            foreach ($analysablevalues as $varkey => $value) {
                // Sha1 to make it unique.
                $values[$varkey][sha1($value)] = $value;
            }
        }
        foreach ($values as $varkey => $varvalues) {
            $values[$varkey] = implode('|', $varvalues);
        }

        // Start writing to the merge file.
        $wh = fopen($tmpfilepath, 'w');
        if (!$wh) {
            throw new \moodle_exception('errorcannotwritedataset', 'analytics', '', $tmpfilepath);
        }

        fputcsv($wh, $varnames);
        fputcsv($wh, $values);
        fputcsv($wh, $columns);

        // Iterate through all files and add them to the tmp one. We don't want file contents in memory.
        foreach ($files as $file) {
            $rh = $file->get_content_file_handle();

            // Skip headers.
            fgets($rh);
            fgets($rh);
            fgets($rh);

            // Copy all the following lines. Compare against false explicitly so that a line
            // evaluating to false (e.g. a final "0" without a newline) is still copied.
            while (($line = fgets($rh)) !== false) {
                fwrite($wh, $line);
            }
            fclose($rh);
        }
        fclose($wh);

        $filerecord = [
            'component' => 'analytics',
            'filearea' => $filearea,
            'itemid' => $modelid,
            'contextid' => \context_system::instance()->id,
            'filepath' => '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/',
            'filename' => self::get_filename($evaluation)
        ];

        $fs = get_file_storage();

        return $fs->create_file_from_pathname($filerecord, $tmpfilepath);
    }
353
 
354
    /**
     * Exports the model training data.
     *
     * All the labelled files stored for this model and time splitting method, evaluation files
     * excluded, are merged into a single file in the export file area.
     *
     * @param int $modelid
     * @param string $timesplittingid
     * @return \stored_file|false The merged file, or false if there is nothing to export.
     */
    public static function export_training_data($modelid, $timesplittingid) {

        $storage = get_file_storage();

        $systemcontextid = \context_system::instance()->id;
        $cleanid = \core_analytics\analysis::clean_time_splitting_id($timesplittingid);
        $directory = '/timesplitting/' . $cleanid . '/';

        $candidates = $storage->get_directory_files($systemcontextid, 'analytics', self::LABELLED_FILEAREA, $modelid,
            $directory, true, false);

        // Discard evaluation files.
        $evaluationfilename = self::EVALUATION_FILENAME;
        $trainingfiles = array_filter($candidates, function($candidate) use ($evaluationfilename) {
            return $candidate->get_filename() !== $evaluationfilename;
        });

        if (empty($trainingfiles)) {
            return false;
        }

        return self::merge_datasets($trainingfiles, $modelid, $timesplittingid, self::EXPORT_FILEAREA);
    }
384
 
385
    /**
     * Returns the dataset file data structured by sampleids using the indicators and target column names.
     *
     * The first two lines of the file (dataset info) are skipped and the third one is used as the
     * column names. In every following row the first field is the sample id; the remaining fields
     * are keyed by column name, with empty strings converted to null and numeric strings to float.
     *
     * @throws \coding_exception If the dataset does not belong to the unlabelled file area.
     * @param \stored_file $dataset
     * @return array Values indexed by sample id and then by column name.
     */
    public static function get_structured_data(\stored_file $dataset) {

        if ($dataset->get_filearea() !== self::UNLABELLED_FILEAREA) {
            throw new \coding_exception('Sorry, only support for unlabelled data');
        }

        $rh = $dataset->get_content_file_handle();

        // Skip dataset info.
        fgets($rh);
        fgets($rh);

        $calculations = array();

        $headers = fgetcsv($rh);
        if (!is_array($headers)) {
            // The file has no column names row, so there are no samples to return.
            fclose($rh);
            return $calculations;
        }

        // Get rid of the sampleid column name.
        array_shift($headers);

        while (($columns = fgetcsv($rh)) !== false) {

            // fgetcsv() reports a blank line as an array holding a single null field; skip those.
            if ($columns === array(null)) {
                continue;
            }

            $uniquesampleid = array_shift($columns);

            // Unfortunately fgetcsv does not respect line's var types.
            $calculations[$uniquesampleid] = array_map(function($value) {

                if ($value === '') {
                    // We really want them as null because converted to float become 0
                    // and we need to treat the values separately.
                    return null;
                } else if (is_numeric($value)) {
                    return floatval($value);
                }
                return $value;
            }, array_combine($headers, $columns));
        }

        // Release the read handle once the whole file has been consumed.
        fclose($rh);

        return $calculations;
    }
428
 
429
    /**
     * Delete all the analytics files of a model, whatever file area they are stored in.
     *
     * @param int $modelid
     * @return bool The result of the file storage delete_area_files() call.
     */
    public static function clear_model_files($modelid) {
        $storage = get_file_storage();
        $systemcontextid = \context_system::instance()->id;

        // No file area is specified (false), so only context, component and item id are used to select the files.
        return $storage->delete_area_files($systemcontextid, 'analytics', false, $modelid);
    }
439
 
440
    /**
     * Returns the file name to be used.
     *
     * @param bool $evaluation Only a strict true selects the evaluation file name.
     * @return string self::EVALUATION_FILENAME for evaluation files, '<current microtime>.csv' otherwise.
     */
    protected static function get_filename($evaluation) {

        if ($evaluation !== true) {
            // Incremental time, the lock will make sure we don't have concurrency problems.
            return microtime(true) . '.csv';
        }

        return self::EVALUATION_FILENAME;
    }
457
}