WebSVN – Moodle – Autoría – /lib/mlbackend/php/classes/processor.php

Rev	Autor	Línea Nro.	Línea
1	efrain	1	`<?php`
		2	`// This file is part of Moodle - http://moodle.org/`
		3	`//`
		4	`// Moodle is free software: you can redistribute it and/or modify`
		5	`// it under the terms of the GNU General Public License as published by`
		6	`// the Free Software Foundation, either version 3 of the License, or`
		7	`// (at your option) any later version.`
		8	`//`
		9	`// Moodle is distributed in the hope that it will be useful,`
		10	`// but WITHOUT ANY WARRANTY; without even the implied warranty of`
		11	`// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
		12	`// GNU General Public License for more details.`
		13	`//`
		14	`// You should have received a copy of the GNU General Public License`
		15	`// along with Moodle. If not, see <http://www.gnu.org/licenses/>.`
		16
		17	`/**`
		18	`* Php predictions processor`
		19	`*`
		20	`* @package mlbackend_php`
		21	`* @copyright 2016 David Monllao {@link http://www.davidmonllao.com}`
		22	`* @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later`
		23	`*/`
		24
		25	`namespace mlbackend_php;`
		26
		27	`defined('MOODLE_INTERNAL') \|\| die();`
		28
		29	`use Phpml\Preprocessing\Normalizer;`
		30	`use Phpml\CrossValidation\RandomSplit;`
		31	`use Phpml\Dataset\ArrayDataset;`
		32	`use Phpml\ModelManager;`
		33	`use Phpml\Classification\Linear\LogisticRegression;`
		34	`use Phpml\Metric\ClassificationReport;`
		35
		36	`/**`
		37	`* PHP predictions processor.`
		38	`*`
		39	`* @package mlbackend_php`
		40	`* @copyright 2016 David Monllao {@link http://www.davidmonllao.com}`
		41	`* @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later`
		42	`*/`
		43	`class processor implements \core_analytics\classifier, \core_analytics\regressor, \core_analytics\packable {`
		44
    /**
     * Size of training / prediction batches.
     *
     * Number of samples accumulated in memory before they are handed over to the classifier
     * (partialTrain() while training, predict() while classifying).
     */
    const BATCH_SIZE = 5000;

    /**
     * Number of train iterations.
     *
     * Passed as the first constructor argument of the LogisticRegression algorithm.
     */
    const TRAIN_ITERATIONS = 500;

    /**
     * File name of the serialised model.
     *
     * Appended to a model directory to build the path of the stored model file.
     */
    const MODEL_FILENAME = 'model.ser';

    /**
     * @var bool Set to true when evaluate_classification() stops reading samples because the approximated
     *           memory usage reached its limit; reported in the info of the evaluation results.
     */
    protected $limitedsize = false;
		64
		65	`/**`
		66	`* Checks if the processor is ready to use.`
		67	`*`
		68	`* @return bool`
		69	`*/`
		70	`public function is_ready() {`
		71	`if (version_compare(phpversion(), '7.0.0') < 0) {`
		72	`return get_string('errorphp7required', 'mlbackend_php');`
		73	`}`
		74	`return true;`
		75	`}`
		76
    /**
     * Delete the stored models.
     *
     * Removes the whole model version output directory.
     *
     * @param string $uniqueid Not used by this backend.
     * @param string $modelversionoutputdir Directory to delete.
     * @return void
     */
    public function clear_model($uniqueid, $modelversionoutputdir) {
        remove_dir($modelversionoutputdir);
    }
		87
    /**
     * Delete the output directory.
     *
     * @param string $modeloutputdir Directory to delete.
     * @param string $uniqueid Not used by this backend.
     * @return void
     */
    public function delete_output_dir($modeloutputdir, $uniqueid) {
        remove_dir($modeloutputdir);
    }
		98
		99	`/**`
		100	`* Train this processor classification model using the provided supervised learning dataset.`
		101	`*`
		102	`* @param string $uniqueid`
		103	`* @param \stored_file $dataset`
		104	`* @param string $outputdir`
		105	`* @return \stdClass`
		106	`*/`
		107	`public function train_classification($uniqueid, \stored_file $dataset, $outputdir) {`
		108
		109	`$modelfilepath = $this->get_model_filepath($outputdir);`
		110
		111	`$modelmanager = new ModelManager();`
		112
		113	`if (file_exists($modelfilepath)) {`
		114	`$classifier = $modelmanager->restoreFromFile($modelfilepath);`
		115	`} else {`
		116	`$classifier = $this->instantiate_algorithm();`
		117	`}`
		118
		119	`$fh = $dataset->get_content_file_handle();`
		120
		121	`// The first lines are var names and the second one values.`
		122	`$metadata = $this->extract_metadata($fh);`
		123
		124	`// Skip headers.`
		125	`fgets($fh);`
		126
		127	`$samples = array();`
		128	`$targets = array();`
		129	`while (($data = fgetcsv($fh)) !== false) {`
		130	`$sampledata = array_map('floatval', $data);`
		131	`$samples[] = array_slice($sampledata, 0, $metadata['nfeatures']);`
		132	`$targets[] = intval($data[$metadata['nfeatures']]);`
		133
		134	`$nsamples = count($samples);`
		135	`if ($nsamples === self::BATCH_SIZE) {`
		136	`// Training it batches to avoid running out of memory.`
		137	`$classifier->partialTrain($samples, $targets, json_decode($metadata['targetclasses']));`
		138	`$samples = array();`
		139	`$targets = array();`
		140	`}`
		141	`if (empty($morethan1sample) && $nsamples > 1) {`
		142	`$morethan1sample = true;`
		143	`}`
		144	`}`
		145	`fclose($fh);`
		146
		147	`if (empty($morethan1sample)) {`
		148	`$resultobj = new \stdClass();`
		149	`$resultobj->status = \core_analytics\model::NO_DATASET;`
		150	`$resultobj->info = array();`
		151	`return $resultobj;`
		152	`}`
		153
		154	`// Train the remaining samples.`
		155	`if ($samples) {`
		156	`$classifier->partialTrain($samples, $targets, json_decode($metadata['targetclasses']));`
		157	`}`
		158
		159	`$resultobj = new \stdClass();`
		160	`$resultobj->status = \core_analytics\model::OK;`
		161	`$resultobj->info = array();`
		162
		163	`// Store the trained model.`
		164	`$modelmanager->saveToFile($classifier, $modelfilepath);`
		165
		166	`return $resultobj;`
		167	`}`
		168
		169	`/**`
		170	`* Classifies the provided dataset samples.`
		171	`*`
		172	`* @param string $uniqueid`
		173	`* @param \stored_file $dataset`
		174	`* @param string $outputdir`
		175	`* @return \stdClass`
		176	`*/`
		177	`public function classify($uniqueid, \stored_file $dataset, $outputdir) {`
		178
		179	`$classifier = $this->load_classifier($outputdir);`
		180
		181	`$fh = $dataset->get_content_file_handle();`
		182
		183	`// The first lines are var names and the second one values.`
		184	`$metadata = $this->extract_metadata($fh);`
		185
		186	`// Skip headers.`
		187	`fgets($fh);`
		188
		189	`$sampleids = array();`
		190	`$samples = array();`
		191	`$predictions = array();`
		192	`while (($data = fgetcsv($fh)) !== false) {`
		193	`$sampledata = array_map('floatval', $data);`
		194	`$sampleids[] = $data[0];`
		195	`$samples[] = array_slice($sampledata, 1, $metadata['nfeatures']);`
		196
		197	`if (count($samples) === self::BATCH_SIZE) {`
		198	`// Prediction it batches to avoid running out of memory.`
		199
		200	`// Append predictions incrementally, we want $sampleids keys in sync with $predictions keys.`
		201	`$newpredictions = $classifier->predict($samples);`
		202	`foreach ($newpredictions as $prediction) {`
		203	`array_push($predictions, $prediction);`
		204	`}`
		205	`$samples = array();`
		206	`}`
		207	`}`
		208	`fclose($fh);`
		209
		210	`// Finish the remaining predictions.`
		211	`if ($samples) {`
		212	`$predictions = $predictions + $classifier->predict($samples);`
		213	`}`
		214
		215	`$resultobj = new \stdClass();`
		216	`$resultobj->status = \core_analytics\model::OK;`
		217	`$resultobj->info = array();`
		218
		219	`foreach ($predictions as $index => $prediction) {`
		220	`$resultobj->predictions[$index] = array($sampleids[$index], $prediction);`
		221	`}`
		222
		223	`return $resultobj;`
		224	`}`
		225
		226	`/**`
		227	`* Evaluates this processor classification model using the provided supervised learning dataset.`
		228	`*`
		229	`* During evaluation we need to shuffle the evaluation dataset samples to detect deviated results,`
		230	`* if the dataset is massive we can not load everything into memory. We know that 2GB is the`
		231	`* minimum memory limit we should have (\core_analytics\model::heavy_duty_mode), if we substract the memory`
		232	`* that we already consumed and the memory that Phpml algorithms will need we should still have at`
		233	`* least 500MB of memory, which should be enough to evaluate a model. In any case this is a robust`
		234	`* solution that will work for all sites but it should minimize memory limit problems. Site admins`
		235	`* can still set $CFG->mlbackend_php_no_evaluation_limits to true to skip this 500MB limit.`
		236	`*`
		237	`* @param string $uniqueid`
		238	`* @param float $maxdeviation`
		239	`* @param int $niterations`
		240	`* @param \stored_file $dataset`
		241	`* @param string $outputdir`
		242	`* @param string $trainedmodeldir`
		243	`* @return \stdClass`
		244	`*/`
		245	`public function evaluate_classification($uniqueid, $maxdeviation, $niterations, \stored_file $dataset,`
		246	`$outputdir, $trainedmodeldir) {`
		247	`$fh = $dataset->get_content_file_handle();`
		248
		249	`if ($trainedmodeldir) {`
		250	`// We overwrite the number of iterations as the results will always be the same.`
		251	`$niterations = 1;`
		252	`$classifier = $this->load_classifier($trainedmodeldir);`
		253	`}`
		254
		255	`// The first lines are var names and the second one values.`
		256	`$metadata = $this->extract_metadata($fh);`
		257
		258	`// Skip headers.`
		259	`fgets($fh);`
		260
		261	`if (empty($CFG->mlbackend_php_no_evaluation_limits)) {`
		262	`$samplessize = 0;`
		263	`$limit = get_real_size('500MB');`
		264
		265	`// Just an approximation, will depend on PHP version, compile options...`
		266	`// Double size + zval struct (6 bytes + 8 bytes + 16 bytes) + array bucket (96 bytes)`
		267	`// https://nikic.github.io/2011/12/12/How-big-are-PHP-arrays-really-Hint-BIG.html.`
		268	`$floatsize = (PHP_INT_SIZE * 2) + 6 + 8 + 16 + 96;`
		269	`}`
		270
		271	`$samples = array();`
		272	`$targets = array();`
		273	`while (($data = fgetcsv($fh)) !== false) {`
		274	`$sampledata = array_map('floatval', $data);`
		275
		276	`$samples[] = array_slice($sampledata, 0, $metadata['nfeatures']);`
		277	`$targets[] = intval($data[$metadata['nfeatures']]);`
		278
		279	`if (empty($CFG->mlbackend_php_no_evaluation_limits)) {`
		280	`// We allow admins to disable evaluation memory usage limits by modifying config.php.`
		281
		282	`// We will have plenty of missing values in the dataset so it should be a conservative approximation.`
		283	`$samplessize = $samplessize + (count($sampledata) * $floatsize);`
		284
		285	`// Stop fetching more samples.`
		286	`if ($samplessize >= $limit) {`
		287	`$this->limitedsize = true;`
		288	`break;`
		289	`}`
		290	`}`
		291	`}`
		292	`fclose($fh);`
		293
		294	`// We need at least 2 samples belonging to each target.`
		295	`$counts = array_count_values($targets);`
		296	`$ntargets = count(explode(',', $metadata['targetclasses']));`
		297	`foreach ($counts as $count) {`
		298	`if ($count < 2) {`
		299	`$notenoughdata = true;`
		300	`}`
		301	`}`
		302	`if ($ntargets > count($counts)) {`
		303	`$notenoughdata = true;`
		304	`}`
		305	`if (!empty($notenoughdata)) {`
		306	`$resultobj = new \stdClass();`
		307	`$resultobj->status = \core_analytics\model::NOT_ENOUGH_DATA;`
		308	`$resultobj->score = 0;`
		309	`$resultobj->info = array(get_string('errornotenoughdata', 'mlbackend_php'));`
		310	`return $resultobj;`
		311	`}`
		312
		313	`$scores = array();`
		314
		315	`// Evaluate the model multiple times to confirm the results are not significantly random due to a short amount of data.`
		316	`for ($i = 0; $i < $niterations; $i++) {`
		317
		318	`if (!$trainedmodeldir) {`
		319	`$classifier = $this->instantiate_algorithm();`
		320
		321	`// Split up the dataset in classifier and testing.`
		322	`$data = new RandomSplit(new ArrayDataset($samples, $targets), 0.2);`
		323
		324	`$classifier->train($data->getTrainSamples(), $data->getTrainLabels());`
		325	`$predictedlabels = $classifier->predict($data->getTestSamples());`
		326	`$report = new ClassificationReport($data->getTestLabels(), $predictedlabels,`
		327	`ClassificationReport::WEIGHTED_AVERAGE);`
		328	`} else {`
		329	`$predictedlabels = $classifier->predict($samples);`
		330	`$report = new ClassificationReport($targets, $predictedlabels,`
		331	`ClassificationReport::WEIGHTED_AVERAGE);`
		332	`}`
		333	`$averages = $report->getAverage();`
		334	`$scores[] = $averages['f1score'];`
		335	`}`
		336
		337	`// Let's fill the results changing the returned status code depending on the phi-related calculated metrics.`
		338	`return $this->get_evaluation_result_object($dataset, $scores, $maxdeviation);`
		339	`}`
		340
		341	`/**`
		342	`* Returns the results objects from all evaluations.`
		343	`*`
		344	`* @param \stored_file $dataset`
		345	`* @param array $scores`
		346	`* @param float $maxdeviation`
		347	`* @return \stdClass`
		348	`*/`
		349	`protected function get_evaluation_result_object(\stored_file $dataset, $scores, $maxdeviation) {`
		350
		351	`// Average f1 score of all evaluations as final score.`
		352	`if (count($scores) === 1) {`
		353	`$avgscore = reset($scores);`
		354	`} else {`
		355	`$avgscore = \Phpml\Math\Statistic\Mean::arithmetic($scores);`
		356	`}`
		357
		358	`// Standard deviation should ideally be calculated against the area under the curve.`
		359	`if (count($scores) === 1) {`
		360	`$modeldev = 0;`
		361	`} else {`
		362	`$modeldev = \Phpml\Math\Statistic\StandardDeviation::population($scores);`
		363	`}`
		364
		365	`// Let's fill the results object.`
		366	`$resultobj = new \stdClass();`
		367
		368	`// Zero is ok, now we add other bits if something is not right.`
		369	`$resultobj->status = \core_analytics\model::OK;`
		370	`$resultobj->info = array();`
		371	`$resultobj->score = $avgscore;`
		372
		373	`// If each iteration results varied too much we need more data to confirm that this is a valid model.`
		374	`if ($modeldev > $maxdeviation) {`
		375	`$resultobj->status = $resultobj->status + \core_analytics\model::NOT_ENOUGH_DATA;`
		376	`$a = new \stdClass();`
		377	`$a->deviation = $modeldev;`
		378	`$a->accepteddeviation = $maxdeviation;`
		379	`$resultobj->info[] = get_string('errornotenoughdatadev', 'mlbackend_php', $a);`
		380	`}`
		381
		382	`if ($resultobj->score < \core_analytics\model::MIN_SCORE) {`
		383	`$resultobj->status = $resultobj->status + \core_analytics\model::LOW_SCORE;`
		384	`$a = new \stdClass();`
		385	`$a->score = $resultobj->score;`
		386	`$a->minscore = \core_analytics\model::MIN_SCORE;`
		387	`$resultobj->info[] = get_string('errorlowscore', 'mlbackend_php', $a);`
		388	`}`
		389
		390	`if ($this->limitedsize === true) {`
		391	`$resultobj->info[] = get_string('datasetsizelimited', 'mlbackend_php', display_size($dataset->get_filesize()));`
		392	`}`
		393
		394	`return $resultobj;`
		395	`}`
		396
		397	`/**`
		398	`* Loads the pre-trained classifier.`
		399	`*`
		400	`* @throws \moodle_exception`
		401	`* @param string $outputdir`
		402	`* @return \Phpml\Classification\Linear\LogisticRegression`
		403	`*/`
		404	`protected function load_classifier($outputdir) {`
		405	`$modelfilepath = $this->get_model_filepath($outputdir);`
		406
		407	`if (!file_exists($modelfilepath)) {`
		408	`throw new \moodle_exception('errorcantloadmodel', 'mlbackend_php', '', $modelfilepath);`
		409	`}`
		410
		411	`$modelmanager = new ModelManager();`
		412	`return $modelmanager->restoreFromFile($modelfilepath);`
		413	`}`
		414
    /**
     * Train this processor regression model using the provided supervised learning dataset.
     *
     * Regression is not implemented by this backend, so this method always throws.
     *
     * @throws \coding_exception Always.
     * @param string $uniqueid
     * @param \stored_file $dataset
     * @param string $outputdir
     * @return \stdClass Never returned, see the thrown exception.
     */
    public function train_regression($uniqueid, \stored_file $dataset, $outputdir) {
        throw new \coding_exception('This predictor does not support regression yet.');
    }
		427
    /**
     * Estimates linear values for the provided dataset samples.
     *
     * Regression is not implemented by this backend, so this method always throws.
     *
     * @throws \coding_exception Always.
     * @param string $uniqueid
     * @param \stored_file $dataset
     * @param mixed $outputdir
     * @return void
     */
    public function estimate($uniqueid, \stored_file $dataset, $outputdir) {
        throw new \coding_exception('This predictor does not support regression yet.');
    }
		440
    /**
     * Evaluates this processor regression model using the provided supervised learning dataset.
     *
     * Regression is not implemented by this backend, so this method always throws.
     *
     * @throws \coding_exception Always.
     * @param string $uniqueid
     * @param float $maxdeviation
     * @param int $niterations
     * @param \stored_file $dataset
     * @param string $outputdir
     * @param string $trainedmodeldir
     * @return \stdClass Never returned, see the thrown exception.
     */
    public function evaluate_regression($uniqueid, $maxdeviation, $niterations, \stored_file $dataset,
            $outputdir, $trainedmodeldir) {
        throw new \coding_exception('This predictor does not support regression yet.');
    }
		457
		458	`/**`
		459	`* Exports the machine learning model.`
		460	`*`
		461	`* @throws \moodle_exception`
		462	`* @param string $uniqueid The model unique id`
		463	`* @param string $modeldir The directory that contains the trained model.`
		464	`* @return string The path to the directory that contains the exported model.`
		465	`*/`
		466	`public function export(string $uniqueid, string $modeldir): string {`
		467
		468	`$modelfilepath = $this->get_model_filepath($modeldir);`
		469
		470	`if (!file_exists($modelfilepath)) {`
		471	`throw new \moodle_exception('errorexportmodelresult', 'analytics');`
		472	`}`
		473
		474	`// We can use the actual $modeldir as the directory is not modified during export, just copied into a zip.`
		475	`return $modeldir;`
		476	`}`
		477
		478	`/**`
		479	`* Imports the provided machine learning model.`
		480	`*`
		481	`* @param string $uniqueid The model unique id`
		482	`* @param string $modeldir The directory that will contain the trained model.`
		483	`* @param string $importdir The directory that contains the files to import.`
		484	`* @return bool Success`
		485	`*/`
		486	`public function import(string $uniqueid, string $modeldir, string $importdir): bool {`
		487
		488	`$importmodelfilepath = $this->get_model_filepath($importdir);`
		489	`$modelfilepath = $this->get_model_filepath($modeldir);`
		490
		491	`$modelmanager = new ModelManager();`
		492
		493	`// Copied from ModelManager::restoreFromFile to validate the serialised contents`
		494	`// before restoring them.`
		495	`$importconfig = file_get_contents($importmodelfilepath);`
		496
		497	`// Clean stuff like function calls.`
		498	`$importconfig = preg_replace('/[^a-zA-Z0-9\{\}%\.\*\;\,\:\"\-\0\\\]/', '', $importconfig);`
		499
		500	`$object = unserialize($importconfig,`
		501	`['allowed_classes' => ['Phpml\\Classification\\Linear\\LogisticRegression']]);`
		502	`if (!$object) {`
		503	`return false;`
		504	`}`
		505
		506	`if (get_class($object) == '__PHP_Incomplete_Class') {`
		507	`return false;`
		508	`}`
		509
		510	`$classifier = $modelmanager->restoreFromFile($importmodelfilepath);`
		511
		512	`// This would override any previous classifier.`
		513	`$modelmanager->saveToFile($classifier, $modelfilepath);`
		514
		515	`return true;`
		516	`}`
		517
		518	`/**`
		519	`* Returns the path to the serialised model file in the provided directory.`
		520	`*`
		521	`* @param string $modeldir The model directory`
		522	`* @return string The model file`
		523	`*/`
		524	`protected function get_model_filepath(string $modeldir): string {`
		525	`// Output directory is already unique to the model.`
		526	`return $modeldir . DIRECTORY_SEPARATOR . self::MODEL_FILENAME;`
		527	`}`
		528
		529	`/**`
		530	`* Extracts metadata from the dataset file.`
		531	`*`
		532	`* The file poiter should be located at the top of the file.`
		533	`*`
		534	`* @param resource $fh`
		535	`* @return array`
		536	`*/`
		537	`protected function extract_metadata($fh) {`
		538	`$metadata = fgetcsv($fh);`
		539	`return array_combine($metadata, fgetcsv($fh));`
		540	`}`
		541
		542	`/**`
		543	`* Instantiates the ML algorithm.`
		544	`*`
		545	`* @return \Phpml\Classification\Linear\LogisticRegression`
		546	`*/`
		547	`protected function instantiate_algorithm(): \Phpml\Classification\Linear\LogisticRegression {`
		548	`return new LogisticRegression(self::TRAIN_ITERATIONS, true,`
		549	`LogisticRegression::CONJUGATE_GRAD_TRAINING, 'log');`
		550	`}`
		551	`}`

Proyectos de Subversion Moodle

(root)/lib/mlbackend/php/classes/processor.php – Rev 1