Proyectos de Subversion Moodle

Rev

| Última modificación | Ver Log |

Rev Autor Línea Nro. Línea
1 efrain 1
<?php
2
// This file is part of Moodle - http://moodle.org/
3
//
4
// Moodle is free software: you can redistribute it and/or modify
5
// it under the terms of the GNU General Public License as published by
6
// the Free Software Foundation, either version 3 of the License, or
7
// (at your option) any later version.
8
//
9
// Moodle is distributed in the hope that it will be useful,
10
// but WITHOUT ANY WARRANTY; without even the implied warranty of
11
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
// GNU General Public License for more details.
13
//
14
// You should have received a copy of the GNU General Public License
15
// along with Moodle.  If not, see <http://www.gnu.org/licenses/>.
16
 
17
/**
18
 * Php predictions processor
19
 *
20
 * @package   mlbackend_php
21
 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
22
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
23
 */
24
 
25
namespace mlbackend_php;
26
 
27
defined('MOODLE_INTERNAL') || die();
28
 
29
use Phpml\Preprocessing\Normalizer;
30
use Phpml\CrossValidation\RandomSplit;
31
use Phpml\Dataset\ArrayDataset;
32
use Phpml\ModelManager;
33
use Phpml\Classification\Linear\LogisticRegression;
34
use Phpml\Metric\ClassificationReport;
35
 
36
/**
37
 * PHP predictions processor.
38
 *
39
 * @package   mlbackend_php
40
 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
41
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
42
 */
43
class processor implements \core_analytics\classifier, \core_analytics\regressor, \core_analytics\packable {
44
 
45
    /**
46
     * Size of training / prediction batches.
47
     */
48
    const BATCH_SIZE = 5000;
49
 
50
    /**
51
     * Number of train iterations.
52
     */
53
    const TRAIN_ITERATIONS = 500;
54
 
55
    /**
56
     * File name of the serialised model.
57
     */
58
    const MODEL_FILENAME = 'model.ser';
59
 
60
    /**
61
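     * @var bool Set to true when an evaluation stops reading samples because the estimated memory usage reached the limit.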
62
     */
63
    protected $limitedsize = false;
64
 
65
    /**
     * Tells whether this backend can run on the current PHP runtime.
     *
     * @return bool|string True when PHP is at least 7.0.0, otherwise the
     *                     'errorphp7required' language string from mlbackend_php.
     */
    public function is_ready() {
        $phpisrecentenough = version_compare(phpversion(), '7.0.0', '>=');
        if ($phpisrecentenough) {
            return true;
        }
        return get_string('errorphp7required', 'mlbackend_php');
    }
76
 
77
    /**
     * Removes the stored model by deleting the model version output directory.
     *
     * The unique id is part of the signature but is not used by this backend.
     *
     * @param string $uniqueid The model unique id (unused here).
     * @param string $modelversionoutputdir Directory passed to remove_dir().
     * @return void
     */
    public function clear_model($uniqueid, $modelversionoutputdir) {
        // Everything is delegated to remove_dir(); nothing else is cleaned up here.
        remove_dir($modelversionoutputdir);
    }
87
 
88
    /**
     * Removes the model output directory.
     *
     * The unique id is part of the signature but is not used by this backend.
     *
     * @param string $modeloutputdir Directory passed to remove_dir().
     * @param string $uniqueid The model unique id (unused here).
     * @return void
     */
    public function delete_output_dir($modeloutputdir, $uniqueid) {
        // Everything is delegated to remove_dir(); nothing else is cleaned up here.
        remove_dir($modeloutputdir);
    }
98
 
99
    /**
     * Train this processor classification model using the provided supervised learning dataset.
     *
     * The dataset starts with two metadata rows (names, then values) followed by a header
     * row. Every remaining row holds 'nfeatures' feature values followed by the target.
     * Rows are fed to the classifier in batches of BATCH_SIZE samples and the resulting
     * model is serialised to the model file inside $outputdir. If a model file already
     * exists there, training continues from it.
     *
     * @param string $uniqueid The model unique id (unused here).
     * @param \stored_file $dataset
     * @param string $outputdir
     * @return \stdClass Object with 'status' (\core_analytics\model::OK, or ::NO_DATASET when
     *                   the dataset never accumulated more than one sample) and 'info'.
     */
    public function train_classification($uniqueid, \stored_file $dataset, $outputdir) {

        $modelfilepath = $this->get_model_filepath($outputdir);

        $modelmanager = new ModelManager();

        // Resume from the stored model when there is one, otherwise start from scratch.
        if (file_exists($modelfilepath)) {
            $classifier = $modelmanager->restoreFromFile($modelfilepath);
        } else {
            $classifier = $this->instantiate_algorithm();
        }

        $fh = $dataset->get_content_file_handle();

        $samples = [];
        $targets = [];
        $targetclasses = null;
        $morethan1sample = false;

        // The handle is released even if reading or training throws.
        try {
            // The first line holds the var names and the second one their values.
            $metadata = $this->extract_metadata($fh);
            $nfeatures = $metadata['nfeatures'];

            // Decoded once; the same list of classes is passed to every partial training call.
            $targetclasses = json_decode($metadata['targetclasses']);

            // Skip headers.
            fgets($fh);

            while (($data = fgetcsv($fh)) !== false) {
                $sampledata = array_map('floatval', $data);
                $samples[] = array_slice($sampledata, 0, $nfeatures);
                $targets[] = intval($data[$nfeatures]);

                // Counted before the batch is flushed so the check below sees the real size.
                $nsamples = count($samples);
                if ($nsamples === self::BATCH_SIZE) {
                    // Training in batches to avoid running out of memory.
                    $classifier->partialTrain($samples, $targets, $targetclasses);
                    $samples = [];
                    $targets = [];
                }
                if ($nsamples > 1) {
                    $morethan1sample = true;
                }
            }
        } finally {
            fclose($fh);
        }

        if (!$morethan1sample) {
            $resultobj = new \stdClass();
            $resultobj->status = \core_analytics\model::NO_DATASET;
            $resultobj->info = array();
            return $resultobj;
        }

        // Train the remaining samples.
        if ($samples) {
            $classifier->partialTrain($samples, $targets, $targetclasses);
        }

        $resultobj = new \stdClass();
        $resultobj->status = \core_analytics\model::OK;
        $resultobj->info = array();

        // Store the trained model.
        $modelmanager->saveToFile($classifier, $modelfilepath);

        return $resultobj;
    }
168
 
169
    /**
     * Classifies the provided dataset samples.
     *
     * The dataset starts with two metadata rows (names, then values) followed by a header
     * row. Every remaining row holds the sample id in the first column followed by
     * 'nfeatures' feature values. Samples are predicted in batches of BATCH_SIZE.
     *
     * @throws \moodle_exception If the trained model file can not be found in $outputdir.
     * @param string $uniqueid The model unique id (unused here).
     * @param \stored_file $dataset
     * @param string $outputdir Directory that contains the trained model.
     * @return \stdClass Object with 'status', 'info' and, when there is at least one sample,
     *                   'predictions': a list of array(sampleid, prediction) in dataset order.
     */
    public function classify($uniqueid, \stored_file $dataset, $outputdir) {

        $classifier = $this->load_classifier($outputdir);

        $fh = $dataset->get_content_file_handle();

        $sampleids = [];
        $samples = [];
        $predictions = [];

        // The handle is released even if reading or predicting throws.
        try {
            // The first line holds the var names and the second one their values.
            $metadata = $this->extract_metadata($fh);

            // Skip headers.
            fgets($fh);

            while (($data = fgetcsv($fh)) !== false) {
                $sampledata = array_map('floatval', $data);
                $sampleids[] = $data[0];
                $samples[] = array_slice($sampledata, 1, $metadata['nfeatures']);

                if (count($samples) === self::BATCH_SIZE) {
                    // Predicting in batches to avoid running out of memory.

                    // Append predictions incrementally, we want $sampleids keys in sync with $predictions keys.
                    foreach ($classifier->predict($samples) as $prediction) {
                        $predictions[] = $prediction;
                    }
                    $samples = [];
                }
            }
        } finally {
            fclose($fh);
        }

        // Finish the remaining predictions. They must be appended: the array union operator
        // would discard every entry whose numeric key already exists in $predictions, losing
        // the last batch as soon as a full batch has been predicted before it.
        if ($samples) {
            foreach ($classifier->predict($samples) as $prediction) {
                $predictions[] = $prediction;
            }
        }

        $resultobj = new \stdClass();
        $resultobj->status = \core_analytics\model::OK;
        $resultobj->info = array();

        foreach ($predictions as $index => $prediction) {
            $resultobj->predictions[$index] = array($sampleids[$index], $prediction);
        }

        return $resultobj;
    }
225
 
226
    /**
     * Evaluates this processor classification model using the provided supervised learning dataset.
     *
     * During evaluation we need to shuffle the evaluation dataset samples to detect deviated results,
     * if the dataset is massive we can not load everything into memory. We know that 2GB is the
     * minimum memory limit we should have (\core_analytics\model::heavy_duty_mode), if we subtract the memory
     * that we already consumed and the memory that Phpml algorithms will need we should still have at
     * least 500MB of memory, which should be enough to evaluate a model. The size calculation is only an
     * approximation, so this is not a guarantee, but it should minimize memory limit problems. Site admins
     * can still set $CFG->mlbackend_php_no_evaluation_limits to true to skip this 500MB limit.
     *
     * When $trainedmodeldir is provided the stored model is evaluated once against the whole dataset,
     * otherwise a new model is trained and tested on a random split of the dataset on every iteration.
     *
     * @throws \moodle_exception If $trainedmodeldir is provided and it does not contain a model file.
     * @param string $uniqueid The model unique id (unused here).
     * @param float $maxdeviation
     * @param int $niterations
     * @param \stored_file $dataset
     * @param string $outputdir Unused here.
     * @param  string $trainedmodeldir
     * @return \stdClass
     */
    public function evaluate_classification($uniqueid, $maxdeviation, $niterations, \stored_file $dataset,
            $outputdir, $trainedmodeldir) {
        // Without this declaration $CFG would be an undefined local variable and the
        // no-limits setting could never be honoured.
        global $CFG;

        // The flag describes the current evaluation only.
        $this->limitedsize = false;

        // Load the model before opening the dataset so a missing model does not leak the file handle.
        $classifier = null;
        if ($trainedmodeldir) {
            // We overwrite the number of iterations as the results will always be the same.
            $niterations = 1;
            $classifier = $this->load_classifier($trainedmodeldir);
        }

        // We allow admins to disable evaluation memory usage limits by modifying config.php.
        $applylimits = empty($CFG->mlbackend_php_no_evaluation_limits);

        $samplessize = 0;
        $limit = 0;
        $floatsize = 0;
        if ($applylimits) {
            $limit = get_real_size('500MB');

            // Just an approximation, will depend on PHP version, compile options...
            // Double size + zval struct (6 bytes + 8 bytes + 16 bytes) + array bucket (96 bytes)
            // https://nikic.github.io/2011/12/12/How-big-are-PHP-arrays-really-Hint-BIG.html.
            $floatsize = (PHP_INT_SIZE * 2) + 6 + 8 + 16 + 96;
        }

        $fh = $dataset->get_content_file_handle();

        $samples = [];
        $targets = [];

        // The handle is released even if reading throws.
        try {
            // The first line holds the var names and the second one their values.
            $metadata = $this->extract_metadata($fh);

            // Skip headers.
            fgets($fh);

            while (($data = fgetcsv($fh)) !== false) {
                $sampledata = array_map('floatval', $data);

                $samples[] = array_slice($sampledata, 0, $metadata['nfeatures']);
                $targets[] = intval($data[$metadata['nfeatures']]);

                if ($applylimits) {
                    // We will have plenty of missing values in the dataset so it should be a conservative approximation.
                    $samplessize = $samplessize + (count($sampledata) * $floatsize);

                    // Stop fetching more samples.
                    if ($samplessize >= $limit) {
                        $this->limitedsize = true;
                        break;
                    }
                }
            }
        } finally {
            fclose($fh);
        }

        // We need at least 2 samples belonging to each target.
        $notenoughdata = false;
        $counts = array_count_values($targets);
        $ntargets = count(explode(',', $metadata['targetclasses']));
        foreach ($counts as $count) {
            if ($count < 2) {
                $notenoughdata = true;
            }
        }
        if ($ntargets > count($counts)) {
            $notenoughdata = true;
        }
        if ($notenoughdata) {
            $resultobj = new \stdClass();
            $resultobj->status = \core_analytics\model::NOT_ENOUGH_DATA;
            $resultobj->score = 0;
            $resultobj->info = array(get_string('errornotenoughdata', 'mlbackend_php'));
            return $resultobj;
        }

        $scores = [];

        // Evaluate the model multiple times to confirm the results are not significantly random due to a short amount of data.
        for ($i = 0; $i < $niterations; $i++) {

            if (!$trainedmodeldir) {
                $classifier = $this->instantiate_algorithm();

                // Split up the dataset in training and testing.
                $data = new RandomSplit(new ArrayDataset($samples, $targets), 0.2);

                $classifier->train($data->getTrainSamples(), $data->getTrainLabels());
                $predictedlabels = $classifier->predict($data->getTestSamples());
                $report = new ClassificationReport($data->getTestLabels(), $predictedlabels,
                    ClassificationReport::WEIGHTED_AVERAGE);
            } else {
                $predictedlabels = $classifier->predict($samples);
                $report = new ClassificationReport($targets, $predictedlabels,
                    ClassificationReport::WEIGHTED_AVERAGE);
            }
            $averages = $report->getAverage();
            $scores[] = $averages['f1score'];
        }

        // Let's fill the results changing the returned status code depending on the calculated scores.
        return $this->get_evaluation_result_object($dataset, $scores, $maxdeviation);
    }
340
 
341
    /**
     * Builds the evaluation result object out of the scores of every iteration.
     *
     * The final score is the arithmetic mean of the scores and the deviation is their
     * population standard deviation; with a single score the score itself is used and the
     * deviation is 0. Status flags and info messages are added when the deviation is
     * above $maxdeviation, when the score is below \core_analytics\model::MIN_SCORE and
     * when the dataset was not read completely.
     *
     * @param \stored_file $dataset
     * @param array $scores
     * @param float $maxdeviation
     * @return \stdClass Object with 'status', 'info' and 'score'.
     */
    protected function get_evaluation_result_object(\stored_file $dataset, $scores, $maxdeviation) {

        if (count($scores) === 1) {
            // Nothing to average and nothing to deviate from.
            $finalscore = reset($scores);
            $deviation = 0;
        } else {
            // Average f1 score of all evaluations as final score.
            $finalscore = \Phpml\Math\Statistic\Mean::arithmetic($scores);

            // Standard deviation should ideally be calculated against the area under the curve.
            $deviation = \Phpml\Math\Statistic\StandardDeviation::population($scores);
        }

        // Zero is ok, other bits are added below if something is not right.
        $result = new \stdClass();
        $result->status = \core_analytics\model::OK;
        $result->info = array();
        $result->score = $finalscore;

        // If each iteration results varied too much we need more data to confirm that this is a valid model.
        if ($deviation > $maxdeviation) {
            $result->status += \core_analytics\model::NOT_ENOUGH_DATA;

            $devinfo = new \stdClass();
            $devinfo->deviation = $deviation;
            $devinfo->accepteddeviation = $maxdeviation;
            $result->info[] = get_string('errornotenoughdatadev', 'mlbackend_php', $devinfo);
        }

        if ($result->score < \core_analytics\model::MIN_SCORE) {
            $result->status += \core_analytics\model::LOW_SCORE;

            $scoreinfo = new \stdClass();
            $scoreinfo->score = $result->score;
            $scoreinfo->minscore = \core_analytics\model::MIN_SCORE;
            $result->info[] = get_string('errorlowscore', 'mlbackend_php', $scoreinfo);
        }

        // Let the user know that only part of the dataset was used.
        if ($this->limitedsize === true) {
            $filesize = display_size($dataset->get_filesize());
            $result->info[] = get_string('datasetsizelimited', 'mlbackend_php', $filesize);
        }

        return $result;
    }
396
 
397
    /**
     * Restores the classifier serialised in the model file of the provided directory.
     *
     * @throws \moodle_exception If the model file does not exist.
     * @param string $outputdir Directory expected to contain the model file.
     * @return \Phpml\Classification\Linear\LogisticRegression
     */
    protected function load_classifier($outputdir) {
        $path = $this->get_model_filepath($outputdir);

        if (file_exists($path)) {
            return (new ModelManager())->restoreFromFile($path);
        }

        throw new \moodle_exception('errorcantloadmodel', 'mlbackend_php', '', $path);
    }
414
 
415
    /**
     * Regression training entry point; regression is not implemented by this backend.
     *
     * @throws \coding_exception Always.
     * @param string $uniqueid
     * @param \stored_file $dataset
     * @param string $outputdir
     * @return \stdClass Never returned, the method always throws.
     */
    public function train_regression($uniqueid, \stored_file $dataset, $outputdir) {
        $notsupported = 'This predictor does not support regression yet.';
        throw new \coding_exception($notsupported);
    }
427
 
428
    /**
     * Regression estimation entry point; regression is not implemented by this backend.
     *
     * @throws \coding_exception Always.
     * @param string $uniqueid
     * @param \stored_file $dataset
     * @param mixed $outputdir
     * @return void Never returns, the method always throws.
     */
    public function estimate($uniqueid, \stored_file $dataset, $outputdir) {
        $notsupported = 'This predictor does not support regression yet.';
        throw new \coding_exception($notsupported);
    }
440
 
441
    /**
     * Regression evaluation entry point; regression is not implemented by this backend.
     *
     * @throws \coding_exception Always.
     * @param string $uniqueid
     * @param float $maxdeviation
     * @param int $niterations
     * @param \stored_file $dataset
     * @param string $outputdir
     * @param  string $trainedmodeldir
     * @return \stdClass Never returned, the method always throws.
     */
    public function evaluate_regression($uniqueid, $maxdeviation, $niterations, \stored_file $dataset,
            $outputdir, $trainedmodeldir) {
        $notsupported = 'This predictor does not support regression yet.';
        throw new \coding_exception($notsupported);
    }
457
 
458
    /**
     * Exports the machine learning model.
     *
     * Nothing is copied or changed here: once the model file is confirmed to exist
     * the same directory that was received is handed back.
     *
     * @throws \moodle_exception If $modeldir does not contain the model file.
     * @param  string $uniqueid  The model unique id (unused here).
     * @param  string $modeldir  The directory that contains the trained model.
     * @return string            The path to the directory that contains the exported model.
     */
    public function export(string $uniqueid, string $modeldir): string {

        $modelexists = file_exists($this->get_model_filepath($modeldir));
        if ($modelexists) {
            // We can use the actual $modeldir as the directory is not modified during export, just copied into a zip.
            return $modeldir;
        }

        throw new \moodle_exception('errorexportmodelresult', 'analytics');
    }
477
 
478
    /**
     * Imports the provided machine learning model.
     *
     * The serialised contents are validated before being restored: they are stripped of
     * unexpected characters and unserialised allowing only the LogisticRegression class.
     * The import is rejected unless that yields a LogisticRegression instance.
     *
     * @param  string $uniqueid The model unique id (unused here).
     * @param  string $modeldir  The directory that will contain the trained model.
     * @param  string $importdir The directory that contains the files to import.
     * @return bool Success; false when the file to import is missing, unreadable or not a valid model.
     */
    public function import(string $uniqueid, string $modeldir, string $importdir): bool {

        $importmodelfilepath = $this->get_model_filepath($importdir);
        $modelfilepath = $this->get_model_filepath($modeldir);

        // Nothing to validate if the file is not there; fail cleanly instead of raising warnings.
        if (!is_readable($importmodelfilepath)) {
            return false;
        }

        // Copied from ModelManager::restoreFromFile to validate the serialised contents
        // before restoring them.
        $importconfig = file_get_contents($importmodelfilepath);
        if ($importconfig === false) {
            return false;
        }

        // Clean stuff like function calls.
        $importconfig = preg_replace('/[^a-zA-Z0-9\{\}%\.\*\;\,\:\"\-\0\\\]/', '', $importconfig);
        if (!is_string($importconfig)) {
            // The regular expression engine failed, there is nothing safe to validate.
            return false;
        }

        $object = unserialize($importconfig,
            ['allowed_classes' => ['Phpml\\Classification\\Linear\\LogisticRegression']]);

        // Rejects unserialize() failures, scalars, arrays and __PHP_Incomplete_Class instances in
        // one go. Calling get_class() on a non-object payload would raise a TypeError on PHP 8.
        if (!($object instanceof LogisticRegression)) {
            return false;
        }

        $modelmanager = new ModelManager();

        // NOTE(review): the model is restored again from the raw file, not from the cleaned
        // string validated above - confirm ModelManager::restoreFromFile applies its own checks.
        $classifier = $modelmanager->restoreFromFile($importmodelfilepath);

        // This would override any previous classifier.
        $modelmanager->saveToFile($classifier, $modelfilepath);

        return true;
    }
517
 
518
    /**
     * Builds the path of the serialised model file inside the provided directory.
     *
     * @param  string $modeldir The model directory
     * @return string           $modeldir, the directory separator and MODEL_FILENAME joined together
     */
    protected function get_model_filepath(string $modeldir): string {
        // Output directory is already unique to the model, so a fixed file name is enough.
        $pathparts = [$modeldir, self::MODEL_FILENAME];
        return implode(DIRECTORY_SEPARATOR, $pathparts);
    }
528
 
529
    /**
     * Reads the dataset metadata from the first two CSV rows of the file.
     *
     * The first row provides the names and the second row their values. The file
     * pointer should be located at the top of the file; it is left right after
     * the second row.
     *
     * @param resource $fh
     * @return array Metadata values indexed by metadata name.
     */
    protected function extract_metadata($fh) {
        $names = fgetcsv($fh);
        $values = fgetcsv($fh);

        return array_combine($names, $values);
    }
541
 
542
    /**
     * Creates a fresh, untrained instance of the ML algorithm.
     *
     * The logistic regression is built with TRAIN_ITERATIONS, the conjugate
     * gradient training constant and the 'log' option.
     *
     * @return \Phpml\Classification\Linear\LogisticRegression
     */
    protected function instantiate_algorithm(): \Phpml\Classification\Linear\LogisticRegression {
        $algorithm = new LogisticRegression(
            self::TRAIN_ITERATIONS,
            true,
            LogisticRegression::CONJUGATE_GRAD_TRAINING,
            'log'
        );

        return $algorithm;
    }
551
}