Ir a la última revisión | Autoría | Comparar con el anterior | Ultima modificación | Ver Log |
<?php
// This file is part of Moodle - http://moodle.org/
//
// Moodle is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Moodle is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Moodle. If not, see <http://www.gnu.org/licenses/>.

/**
 * Datasets manager.
 *
 * @package core_analytics
 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */

namespace core_analytics;

defined('MOODLE_INTERNAL') || die();

/**
 * Datasets manager.
 *
 * Stores, retrieves, merges and deletes the CSV dataset files of a model. All files live in the
 * system context, under the 'analytics' component, using the model id as item id.
 *
 * @package core_analytics
 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
class dataset_manager {

    /**
     * File area for labelled datasets.
     */
    const LABELLED_FILEAREA = 'labelled';

    /**
     * File area for unlabelled datasets.
     */
    const UNLABELLED_FILEAREA = 'unlabelled';

    /**
     * File area for exported datasets.
     */
    const EXPORT_FILEAREA = 'export';

    /**
     * Evaluation file file name.
     */
    const EVALUATION_FILENAME = 'evaluation.csv';

    /**
     * The model id.
     *
     * @var int
     */
    protected $modelid;

    /**
     * Range processor in use.
     *
     * @var string
     */
    protected $timesplittingid;

    /**
     * The analysable id, used as part of the stored file path.
     *
     * @var int
     */
    protected $analysableid;

    /**
     * Whether this is a dataset for evaluation or not.
     *
     * @var bool
     */
    protected $evaluation;

    /**
     * The dataset filearea. Must be one of the self::*_FILEAREA options.
     *
     * @var string
     */
    protected $filearea;

    /**
     * Constructor method.
     *
     * @throws \coding_exception If $filearea is not one of the self::*_FILEAREA constants.
     * @param int $modelid
     * @param int $analysableid
     * @param string $timesplittingid
     * @param string $filearea
     * @param bool $evaluation
     * @return void
     */
    public function __construct($modelid, $analysableid, $timesplittingid, $filearea, $evaluation = false) {

        if ($filearea !== self::EXPORT_FILEAREA && $filearea !== self::LABELLED_FILEAREA &&
                $filearea !== self::UNLABELLED_FILEAREA) {
            throw new \coding_exception('Invalid provided filearea');
        }

        $this->modelid = $modelid;
        $this->analysableid = $analysableid;
        $this->timesplittingid = $timesplittingid;
        $this->filearea = $filearea;
        $this->evaluation = $evaluation;
    }

    /**
     * Store the dataset in the internal file system.
     *
     * The rows are first written as CSV to a file in a request directory and that file is then
     * added to the file storage under /analysable/{analysableid}/{time splitting id}/.
     *
     * @param array $data Rows to write; each row is passed to fputcsv().
     * @return \stored_file|false False if the temporary file could not be opened for writing.
     */
    public function store($data) {

        $fs = get_file_storage();

        $filerecord = [
            'component' => 'analytics',
            'filearea' => $this->filearea,
            'itemid' => $this->modelid,
            'contextid' => \context_system::instance()->id,
            'filepath' => '/analysable/' . $this->analysableid . '/' .
                \core_analytics\analysis::clean_time_splitting_id($this->timesplittingid) . '/',
            'filename' => self::get_filename($this->evaluation)
        ];

        // Delete previous and old (we already checked that previous copies are not recent) evaluation files for this analysable.
        if ($this->evaluation) {
            // This fragment is handed over as the item id test of delete_area_files_select(), that is why it starts
            // with the comparison operator.
            $select = " = {$filerecord['itemid']} AND filepath = :filepath";
            $fs->delete_area_files_select($filerecord['contextid'], $filerecord['component'], $filerecord['filearea'],
                $select, array('filepath' => $filerecord['filepath']));
        }

        // Write all this stuff to a tmp file.
        $filepath = make_request_directory() . DIRECTORY_SEPARATOR . $filerecord['filename'];
        $fh = fopen($filepath, 'w+');
        if (!$fh) {
            return false;
        }
        foreach ($data as $line) {
            fputcsv($fh, $line);
        }
        fclose($fh);

        return $fs->create_file_from_pathname($filerecord, $filepath);
    }

    /**
     * Returns the previous evaluation file.
     *
     * Important to note that this is per modelid + timesplittingid, when dealing with multiple
     * analysables this is the merged file. Do not confuse with self::get_evaluation_analysable_file
     *
     * @param int $modelid
     * @param string $timesplittingid
     * @return \stored_file|bool The value returned by the file storage get_file() call; callers in this class
     *                           treat a falsy value as "no such file".
     */
    public static function get_previous_evaluation_file($modelid, $timesplittingid) {
        $fs = get_file_storage();

        // Evaluation data is always labelled.
        $filepath = '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';
        return $fs->get_file(\context_system::instance()->id, 'analytics', self::LABELLED_FILEAREA, $modelid,
            $filepath, self::EVALUATION_FILENAME);
    }

    /**
     * Gets the list of files that couldn't be previously used for training and prediction.
     *
     * Files whose id is listed in analytics_used_files for this model (action 'trained' when $includetarget
     * is set, 'predicted' otherwise), evaluation files and directories are left out.
     *
     * @param int $modelid
     * @param bool $includetarget True to look at labelled files, false to look at unlabelled files.
     * @param string[] $timesplittingids
     * @return array Lists of pending files indexed by time splitting id. Time splitting ids without
     *               pending files have no entry.
     */
    public static function get_pending_files($modelid, $includetarget, $timesplittingids) {
        global $DB;

        $fs = get_file_storage();

        if ($includetarget) {
            $filearea = self::LABELLED_FILEAREA;
            $usedfileaction = 'trained';
        } else {
            $filearea = self::UNLABELLED_FILEAREA;
            $usedfileaction = 'predicted';
        }

        $select = 'modelid = :modelid AND action = :action';
        $params = array('modelid' => $modelid, 'action' => $usedfileaction);
        $usedfileids = $DB->get_fieldset_select('analytics_used_files', 'fileid', $select, $params);

        // Very likely that we will only have 1 time splitting method here.
        $filesbytimesplitting = array();
        foreach ($timesplittingids as $timesplittingid) {

            $filepath = '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';
            $files = $fs->get_directory_files(\context_system::instance()->id, 'analytics', $filearea, $modelid, $filepath);
            foreach ($files as $file) {

                // Discard evaluation files.
                if ($file->get_filename() === self::EVALUATION_FILENAME) {
                    continue;
                }

                // No dirs.
                if ($file->is_directory()) {
                    continue;
                }

                // Already used for the action we are interested in (training or prediction).
                if (in_array($file->get_id(), $usedfileids)) {
                    continue;
                }

                $filesbytimesplitting[$timesplittingid][] = $file;
            }
        }

        return $filesbytimesplitting;
    }

    /**
     * Deletes previous evaluation files of this model.
     *
     * @param int $modelid
     * @param string $timesplittingid
     * @return bool True if there was a previous evaluation file and delete() was called on it, false otherwise.
     */
    public static function delete_previous_evaluation_file($modelid, $timesplittingid) {
        if ($file = self::get_previous_evaluation_file($modelid, $timesplittingid)) {
            $file->delete();
            return true;
        }

        return false;
    }

    /**
     * Returns this (model + analysable + time splitting) file.
     *
     * @param int $modelid
     * @param int $analysableid
     * @param string $timesplittingid
     * @return \stored_file|bool The value returned by the file storage get_file() call.
     */
    public static function get_evaluation_analysable_file($modelid, $analysableid, $timesplittingid) {

        $fs = get_file_storage();

        // Always evaluation.csv and labelled as it is an evaluation file.
        $filearea = self::LABELLED_FILEAREA;
        $filename = self::get_filename(true);
        $filepath = '/analysable/' . $analysableid . '/' .
            \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';
        return $fs->get_file(\context_system::instance()->id, 'analytics', $filearea, $modelid, $filepath, $filename);
    }

    /**
     * Merge multiple files into one.
     *
     * Every dataset is expected to start with three header rows: the variable names, the analysable
     * values of those variables and the column names. The merged file keeps the variable names and the
     * column names of the last file, joins the distinct values of each variable with '|' and then
     * appends the remaining lines of every file in order.
     *
     * Important! It is the caller responsability to ensure that the datasets are compatible.
     *
     * @throws \moodle_exception If the temporary merge file can not be opened for writing.
     * @param array $files Files providing get_content_file_handle(). NOTE(review): an empty list is not
     *                     supported, the header variables would not be set.
     * @param int $modelid
     * @param string $timesplittingid
     * @param string $filearea
     * @param bool $evaluation
     * @return \stored_file
     */
    public static function merge_datasets(array $files, $modelid, $timesplittingid, $filearea, $evaluation = false) {

        $tmpfilepath = make_request_directory() . DIRECTORY_SEPARATOR . 'tmpfile.csv';

        // Add headers.
        // We could also do this with a single iteration gathering all files headers and appending them to the beginning of the file
        // once all file contents are merged.
        $varnames = '';
        $analysablesvalues = array();
        foreach ($files as $file) {
            $rh = $file->get_content_file_handle();

            // Copy the var names as they are, all files should have the same var names.
            $varnames = fgetcsv($rh);

            $analysablesvalues[] = fgetcsv($rh);

            // Copy the columns as they are, all files should have the same columns.
            $columns = fgetcsv($rh);

            // We only needed the headers, release the handle before opening the next file.
            fclose($rh);
        }

        // Merge analysable values skipping the ones that are the same in all analysables.
        $values = array();
        foreach ($analysablesvalues as $analysablevalues) {
            foreach ($analysablevalues as $varkey => $value) {
                // Sha1 to make it unique.
                $values[$varkey][sha1($value)] = $value;
            }
        }
        foreach ($values as $varkey => $varvalues) {
            $values[$varkey] = implode('|', $varvalues);
        }

        // Start writing to the merge file.
        $wh = fopen($tmpfilepath, 'w');
        if (!$wh) {
            throw new \moodle_exception('errorcannotwritedataset', 'analytics', '', $tmpfilepath);
        }

        fputcsv($wh, $varnames);
        fputcsv($wh, $values);
        fputcsv($wh, $columns);

        // Iterate through all files and add them to the tmp one. We don't want file contents in memory.
        foreach ($files as $file) {
            $rh = $file->get_content_file_handle();

            // Skip headers.
            fgets($rh);
            fgets($rh);
            fgets($rh);

            // Copy all the following lines. Strict comparison so a falsy line (e.g. a final '0' without
            // trailing line break) does not end the copy early.
            while (($line = fgets($rh)) !== false) {
                fwrite($wh, $line);
            }
            fclose($rh);
        }
        fclose($wh);

        $filerecord = [
            'component' => 'analytics',
            'filearea' => $filearea,
            'itemid' => $modelid,
            'contextid' => \context_system::instance()->id,
            'filepath' => '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/',
            'filename' => self::get_filename($evaluation)
        ];

        $fs = get_file_storage();

        return $fs->create_file_from_pathname($filerecord, $tmpfilepath);
    }

    /**
     * Exports the model training data.
     *
     * Merges all the labelled, non-evaluation files of the model and time splitting method into a
     * single file stored in the export file area.
     *
     * @param int $modelid
     * @param string $timesplittingid
     * @return \stored_file|false False if there is no training data to export.
     */
    public static function export_training_data($modelid, $timesplittingid) {

        $fs = get_file_storage();

        $contextid = \context_system::instance()->id;
        $filepath = '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';

        $files = $fs->get_directory_files($contextid, 'analytics', self::LABELLED_FILEAREA, $modelid,
            $filepath, true, false);

        // Discard evaluation files.
        foreach ($files as $key => $file) {
            if ($file->get_filename() === self::EVALUATION_FILENAME) {
                unset($files[$key]);
            }
        }

        if (empty($files)) {
            return false;
        }

        return self::merge_datasets($files, $modelid, $timesplittingid, self::EXPORT_FILEAREA);
    }

    /**
     * Returns the dataset file data structured by sampleids using the indicators and target column names.
     *
     * The first two lines of the dataset are skipped, the third one provides the column names and
     * the first column of every following row is used as the key of that row in the returned array.
     *
     * @throws \coding_exception If the dataset does not belong to the unlabelled file area.
     * @param \stored_file $dataset
     * @return array Rows indexed by their first column value; each row is an array indexed by column
     *               name where empty strings become null and numeric strings become floats.
     */
    public static function get_structured_data(\stored_file $dataset) {

        if ($dataset->get_filearea() !== self::UNLABELLED_FILEAREA) {
            throw new \coding_exception('Sorry, only support for unlabelled data');
        }

        $rh = $dataset->get_content_file_handle();

        // Skip dataset info.
        fgets($rh);
        fgets($rh);

        $calculations = array();

        $headers = fgetcsv($rh);
        // Get rid of the sampleid column name.
        array_shift($headers);

        while ($columns = fgetcsv($rh)) {
            $uniquesampleid = array_shift($columns);

            // Unfortunately fgetcsv does not respect line's var types.
            $calculations[$uniquesampleid] = array_map(function($value) {

                if ($value === '') {
                    // We really want them as null because converted to float become 0
                    // and we need to treat the values separately.
                    return null;
                } else if (is_numeric($value)) {
                    return floatval($value);
                }
                return $value;
            }, array_combine($headers, $columns));
        }

        // Everything is in memory now, release the handle.
        fclose($rh);

        return $calculations;
    }

    /**
     * Delete all files of a model.
     *
     * @param int $modelid
     * @return bool The value returned by the file storage delete_area_files() call.
     */
    public static function clear_model_files($modelid) {
        $fs = get_file_storage();
        return $fs->delete_area_files(\context_system::instance()->id, 'analytics', false, $modelid);
    }

    /**
     * Returns the file name to be used.
     *
     * @param bool $evaluation Only a strict true selects the evaluation file name.
     * @return string self::EVALUATION_FILENAME for evaluation datasets, a microtime-based name otherwise.
     */
    protected static function get_filename($evaluation) {

        if ($evaluation === true) {
            $filename = self::EVALUATION_FILENAME;
        } else {
            // Incremental time, the lock will make sure we don't have concurrency problems.
            $filename = microtime(true) . '.csv';
        }

        return $filename;
    }
}