Proyectos de Subversion Moodle

Rev

| Última modificación | Ver Log |

Rev Autor Línea Nro. Línea
1 efrain 1
<?php
2
// This file is part of Moodle - http://moodle.org/
3
//
4
// Moodle is free software: you can redistribute it and/or modify
5
// it under the terms of the GNU General Public License as published by
6
// the Free Software Foundation, either version 3 of the License, or
7
// (at your option) any later version.
8
//
9
// Moodle is distributed in the hope that it will be useful,
10
// but WITHOUT ANY WARRANTY; without even the implied warranty of
11
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
// GNU General Public License for more details.
13
//
14
// You should have received a copy of the GNU General Public License
15
// along with Moodle.  If not, see <http://www.gnu.org/licenses/>.
16
 
17
/**
18
 * Runs an analysis of the site.
19
 *
20
 * @package   core_analytics
21
 * @copyright 2019 David Monllao {@link http://www.davidmonllao.com}
22
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
23
 */
24
 
25
namespace core_analytics;
26
 
27
defined('MOODLE_INTERNAL') || die();
28
 
29
/**
30
 * Runs an analysis of the site.
31
 *
32
 * @package   core_analytics
33
 * @copyright 2019 David Monllao {@link http://www.davidmonllao.com}
34
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
35
 */
36
class analysis {
37
 
38
    /**
39
     * @var \core_analytics\local\analyser\base
40
     */
41
    private $analyser;
42
 
43
    /**
44
     * @var bool Whether to calculate the target or not in this run.
45
     */
46
    private $includetarget;
47
 
48
    /**
49
     * @var \core_analytics\local\analysis\result
50
     */
51
    private $result;
52
 
53
    /**
54
     * @var \core\lock\lock
55
     */
56
    private $lock;
57
 
58
    /**
     * Sets up an analysis run.
     *
     * @param \core_analytics\local\analyser\base   $analyser      The analyser that drives this run.
     * @param bool                                  $includetarget Whether to calculate the target or not.
     * @param \core_analytics\local\analysis\result $result        The object the analysis results are handed to.
     */
    public function __construct(\core_analytics\local\analyser\base $analyser, bool $includetarget,
            \core_analytics\local\analysis\result $result) {
        $this->result = $result;
        $this->includetarget = $includetarget;
        $this->analyser = $analyser;

        // Time-splitting methods can depend on the first time each analysable was analysed,
        // so that information is cached up front for the model being analysed.
        self::fill_firstanalyses_cache($analyser->get_modelid());
    }
74
 
75
    /**
     * Runs the analysis.
     *
     * Walks through the analysables provided by the analyser, processes each of them and hands the
     * results over to the result object. Unless the model is being evaluated, the time each analysable
     * was analysed is recorded and the loop stops as soon as the time spent (in seconds) reaches the
     * 'modeltimelimit' setting of the 'analytics' component.
     *
     * @param \context[] $contexts Restrict the analysis to these contexts. Pass an empty array (the default)
     *                             for no context restrictions.
     * @return void
     */
    public function run(array $contexts = []) {

        $options = $this->analyser->get_options();

        // Time limit control.
        $modeltimelimit = intval(get_config('analytics', 'modeltimelimit'));

        $action = $this->includetarget ? 'training' : 'prediction';
        $analysables = $this->analyser->get_analysables_iterator($action, $contexts);

        $processedanalysables = $this->get_processed_analysables();

        $inittime = microtime(true);
        foreach ($analysables as $analysable) {
            $processed = false;

            if (!$analysable) {
                continue;
            }

            $analysableresults = $this->process_analysable($analysable);
            if ($analysableresults) {
                $processed = $this->result->add_analysable_results($analysableresults);
                if (!$processed) {
                    // Nothing usable came out of this analysable: log the reason given by each time splitting method.
                    $prefixwithid = count($analysableresults) > 1;
                    $errors = [];
                    foreach ($analysableresults as $timesplittingid => $result) {
                        // Results built from a cached result carry no message, so we can not assume it is set.
                        $message = $result->message ?? '';
                        $errors[] = $prefixwithid ? $timesplittingid . ': ' . $message : $message;
                    }

                    $a = new \stdClass();
                    $a->analysableid = $analysable->get_name();
                    $a->errors = implode(', ', $errors);
                    $this->analyser->add_log(get_string('analysablenotused', 'analytics', $a));
                }
            }

            if (!$options['evaluation']) {

                if (empty($processedanalysables[$analysable->get_id()]) ||
                        $this->analyser->get_target()->always_update_analysis_time() || $processed) {
                    // We store the list of processed analysables even if the target does not always_update_analysis_time(),
                    // what always_update_analysis_time controls is the update of the data.
                    $this->update_analysable_analysed_time($processedanalysables, $analysable->get_id());
                }

                // Apply time limit.
                $timespent = microtime(true) - $inittime;
                if ($modeltimelimit <= $timespent) {
                    break;
                }
            }
        }

        // Force GC to clean up the indicator instances used during the last iteration.
        $this->analyser->instantiate_indicators();
    }
146
 
147
    /**
     * Returns the analysables this model has already processed for the current action.
     *
     * The action is 'training' when the target is included in this run and 'prediction' otherwise.
     *
     * @return \stdClass[] analytics_used_analysables records, most recently analysed first. The real record
     *                     id is returned as 'primarykey'.
     */
    protected function get_processed_analysables(): array {
        global $DB;

        // Weird fields ordering for performance: analysableid goes first for key matching
        // (analysableid is also unique by modelid).
        $fields = 'analysableid, modelid, action, firstanalysis, timeanalysed, id AS primarykey';

        $params = [
            'modelid' => $this->analyser->get_modelid(),
            'action' => $this->includetarget ? 'training' : 'prediction',
        ];

        return $DB->get_records_select('analytics_used_analysables', 'modelid = :modelid and action = :action',
            $params, 'timeanalysed DESC', $fields);
    }
163
 
164
    /**
     * Processes an analysable.
     *
     * The analysable is first checked against the target. If the target rejects it the reason is logged
     * and nothing is returned. Otherwise the analysable is run through every time splitting method of the
     * analyser, reusing the cached result of a time splitting method when the result object has one.
     *
     * @param \core_analytics\analysable $analysable
     * @return \stdClass[] Results objects by time splitting method id. Empty if the analysable is not valid
     *                     for the target.
     */
    public function process_analysable(\core_analytics\analysable $analysable): array {

        // Target instances scope is per-analysable (it can't be lower as calculations run once per
        // analysable, not time splitting method nor time range).
        $target = call_user_func(array($this->analyser->get_target(), 'instance'));

        // Invalid analysables have to be discarded even when the target is not going to be calculated.
        $validity = $target->is_valid_analysable($analysable, $this->includetarget);
        if ($validity !== true) {
            $a = new \stdClass();
            $a->analysableid = $analysable->get_name();
            $a->result = $validity;
            $this->analyser->add_log(get_string('analysablenotvalidfortarget', 'analytics', $a));
            return [];
        }

        // Go through all the provided time splitting methods.
        $results = [];
        foreach ($this->analyser->get_timesplittings() as $timesplitting) {

            $cachedresult = $this->result->retrieve_cached_result($timesplitting, $analysable);
            if ($cachedresult) {
                // Wrap the cached result so that it looks like a freshly calculated one.
                $wrapped = new \stdClass();
                $wrapped->result = $cachedresult;
                $results[$timesplitting->get_id()] = $wrapped;
            } else {
                $results[$timesplitting->get_id()] = $this->process_time_splitting($timesplitting, $analysable, $target);
            }
        }

        return $results;
    }
207
 
208
    /**
     * Processes the analysable samples using the provided time splitting method.
     *
     * The returned object always has a 'status' (one of the \core_analytics\model constants) and a
     * 'message'. On success (\core_analytics\model::OK) it also has a 'result' property holding what
     * the result object returned from format_result().
     *
     * Once init_analysable_analysis() has flagged the analysis as in progress, the flag is always
     * cleared through finish_analysable_analysis(), no matter how this method is left.
     *
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param \core_analytics\analysable $analysable
     * @param \core_analytics\local\target\base $target
     * @return \stdClass Results object.
     * @throws \moodle_exception If the result object could not format the calculated data.
     */
    protected function process_time_splitting(\core_analytics\local\time_splitting\base $timesplitting,
            \core_analytics\analysable $analysable, \core_analytics\local\target\base $target): \stdClass {

        $options = $this->analyser->get_options();

        $result = new \stdClass();

        $timesplitting->set_modelid($this->analyser->get_modelid());
        if (!$timesplitting->is_valid_analysable($analysable)) {
            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
            $result->message = get_string('invalidanalysablefortimesplitting', 'analytics',
                $timesplitting->get_name());
            return $result;
        }
        $timesplitting->set_analysable($analysable);

        if (CLI_SCRIPT && !PHPUNIT_TEST) {
            mtrace('Analysing id "' . $analysable->get_id() . '" with "' . $timesplitting->get_name() .
                '" time splitting method...');
        }

        // What is a sample is defined by the analyser, it can be an enrolment, a course, a user, a question
        // attempt... it is on what we will base indicators calculations.
        list($sampleids, $samplesdata) = $this->analyser->get_all_samples($analysable);

        if (count($sampleids) === 0) {
            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
            $result->message = get_string('nodata', 'analytics');
            return $result;
        }

        if ($this->includetarget) {
            // All ranges are used when we are calculating data for training.
            $ranges = $timesplitting->get_training_ranges();
        } else {
            // The latest range that has not yet been used for prediction (it depends on the time range where we are right now).
            $ranges = $timesplitting->get_most_recent_prediction_range();
        }

        // There is no need to keep track of the evaluated samples and ranges as we always evaluate the whole dataset.
        if ($options['evaluation'] === false) {

            if (empty($ranges)) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('noranges', 'analytics');
                return $result;
            }

            // We skip all samples that are already part of a training dataset, even if they have not been used for prediction.
            if (!$target::based_on_assumptions()) {
                // Targets based on assumptions can not be trained.
                $this->filter_out_train_samples($sampleids, $timesplitting);
            }

            if (count($sampleids) === 0) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('nonewdata', 'analytics');
                return $result;
            }

            // Only when processing data for predictions.
            if (!$this->includetarget) {
                // We also filter out samples and ranges that have already been used for predictions.
                $predictsamplesrecord = $this->filter_out_prediction_samples_and_ranges($sampleids, $ranges, $timesplitting);
            }

            if (count($sampleids) === 0) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('nonewdata', 'analytics');
                return $result;
            }

            if (count($ranges) === 0) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('nonewranges', 'analytics');
                return $result;
            }
        }

        // Flag the model + analysable + timesplitting as being analysed (prevent concurrent executions).
        if (!$this->init_analysable_analysis($timesplitting->get_id(), $analysable->get_id())) {
            // If this model + analysable + timesplitting combination is being analysed we skip this process.
            $result->status = \core_analytics\model::NO_DATASET;
            $result->message = get_string('analysisinprogress', 'analytics');
            return $result;
        }

        // From here on the analysis is flagged as in progress. A single finally block clears the flag
        // on every exit path (early returns, the final return and any exception).
        try {
            // Remove samples the target consider invalid.
            $target->add_sample_data($samplesdata);
            $target->filter_out_invalid_samples($sampleids, $analysable, $this->includetarget);

            if (!$sampleids) {
                $result->status = \core_analytics\model::NO_DATASET;
                $result->message = get_string('novalidsamples', 'analytics');
                return $result;
            }

            // Instantiate empty indicators to ensure that no garbage is dragged from previous analyses.
            $indicators = $this->analyser->instantiate_indicators();
            foreach ($indicators as $indicator) {
                // The analyser attaches the main entities the sample depends on and are provided to the
                // indicator to calculate the sample.
                $indicator->add_sample_data($samplesdata);
            }

            // Here we start the memory intensive process that will last until $data var is
            // unset (until the method is finished basically).
            $data = $this->calculate($timesplitting, $sampleids, $ranges, $target);

            if (!$data) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('novaliddata', 'analytics');
                return $result;
            }

            // No need to keep track of analysed stuff when evaluating.
            if ($options['evaluation'] === false) {
                // Save the samples that have been already analysed so they are not analysed again in future.

                if ($this->includetarget) {
                    $this->save_train_samples($sampleids, $timesplitting);
                } else {
                    // The variable $predictsamplesrecord will always be set as filter_out_prediction_samples_and_ranges
                    // will always be called before it (no evaluation mode and no includetarget).
                    $this->save_prediction_samples($sampleids, $ranges, $timesplitting, $predictsamplesrecord);
                }
            }

            // We need to pass all the analysis data.
            $formattedresult = $this->result->format_result($data, $target, $timesplitting, $analysable);

            if (!$formattedresult) {
                throw new \moodle_exception('errorcannotwritedataset', 'analytics');
            }

            $result->status = \core_analytics\model::OK;
            $result->message = get_string('successfullyanalysed', 'analytics');
            $result->result = $formattedresult;

            return $result;

        } finally {
            // Flag the model + analysable + timesplitting as no longer being analysed.
            $this->finish_analysable_analysis();
        }
    }
379
 
380
    /**
     * Calculates indicators and targets.
     *
     * When the target is included in this run it is calculated first and the samples whose target
     * could not be calculated are removed from $sampleids, which is passed by reference.
     *
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param array $sampleids Passed by reference; narrowed down to the samples with a calculated target.
     * @param array $ranges
     * @param \core_analytics\local\target\base $target
     * @return array|null The dataset with the header row first, or null if there is nothing to return.
     */
    public function calculate(\core_analytics\local\time_splitting\base $timesplitting, array &$sampleids,
            array $ranges, \core_analytics\local\target\base $target): ?array {

        $calculatedtarget = null;
        if ($this->includetarget) {
            // We first calculate the target because analysable data may still be invalid or none
            // of the analysable samples may be valid.
            $calculatedtarget = $target->calculate($sampleids, $timesplitting->get_analysable());

            // We remove samples we can not calculate their target. isset() covers both a null target
            // and a sample the target returned no entry for, without raising an undefined key warning.
            $sampleids = array_filter($sampleids, function($sampleid) use ($calculatedtarget) {
                return isset($calculatedtarget[$sampleid]);
            });
        }

        // No need to continue calculating if the target couldn't be calculated for any sample.
        if (empty($sampleids)) {
            return null;
        }

        $dataset = $this->calculate_indicators($timesplitting, $sampleids, $ranges);

        if (empty($dataset)) {
            return null;
        }

        // Now that we have the indicators in place we can add the time range indicators (and target if provided) to each of them.
        $this->fill_dataset($timesplitting, $dataset, $calculatedtarget);

        $this->add_context_metadata($timesplitting, $dataset, $target);

        if (!PHPUNIT_TEST && CLI_SCRIPT) {
            echo PHP_EOL;
        }

        return $dataset;
    }
429
 
430
    /**
     * Calculates indicators.
     *
     * Every indicator of the analyser is calculated for every range. The features of a sample in a range
     * are appended, indicator after indicator, to the dataset row identified by the sample id combined
     * with the range index. Rows for which no indicator reported a not-null calculation are dropped.
     *
     * Unless the model is being evaluated, and only if the time splitting method caches indicator
     * calculations, the new calculations are stored in analytics_indicator_calc in batches.
     *
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param array $sampleids
     * @param array $ranges
     * @return array The dataset rows indexed by unique sample id (sample id + range index).
     */
    protected function calculate_indicators(\core_analytics\local\time_splitting\base $timesplitting, array $sampleids,
            array $ranges): array {
        global $DB;

        $options = $this->analyser->get_options();

        $dataset = array();

        // Faster to run 1 db query per range.
        $existingcalculations = array();
        if ($timesplitting->cache_indicator_calculations()) {
            foreach ($ranges as $rangeindex => $range) {
                // Load existing calculations.
                $existingcalculations[$rangeindex] = \core_analytics\manager::get_indicator_calculations(
                    $timesplitting->get_analysable(), $range['start'], $range['end'], $this->analyser->get_samples_origin());
            }
        }

        // Here we store samples which calculations are not all null.
        $notnulls = array();

        // Fill the dataset samples with indicators data.
        $newcalculations = array();
        foreach ($this->analyser->get_indicators() as $indicator) {

            // Hook to allow indicators to store analysable-dependant data.
            $indicator->fill_per_analysable_caches($timesplitting->get_analysable());

            // Per-range calculations.
            foreach ($ranges as $rangeindex => $range) {

                // Indicator instances are per-range.
                $rangeindicator = clone $indicator;

                $prevcalculations = array();
                if (!empty($existingcalculations[$rangeindex][$rangeindicator->get_id()])) {
                    $prevcalculations = $existingcalculations[$rangeindex][$rangeindicator->get_id()];
                }

                // Calculate the indicator for each sample in this time range.
                list($samplesfeatures, $newindicatorcalculations, $indicatornotnulls) = $rangeindicator->calculate($sampleids,
                    $this->analyser->get_samples_origin(), $range['start'], $range['end'], $prevcalculations);

                // Associate the extra data generated by the indicator to this range index.
                $rangeindicator->save_calculation_info($timesplitting, $rangeindex);

                // Free memory ASAP.
                unset($rangeindicator);
                gc_collect_cycles();
                gc_mem_caches();

                // Copy the features data to the dataset.
                foreach ($samplesfeatures as $analysersampleid => $features) {

                    $uniquesampleid = $timesplitting->append_rangeindex($analysersampleid, $rangeindex);

                    if (!isset($notnulls[$uniquesampleid]) && !empty($indicatornotnulls[$analysersampleid])) {
                        $notnulls[$uniquesampleid] = $uniquesampleid;
                    }

                    // Init the sample if it is still empty.
                    if (!isset($dataset[$uniquesampleid])) {
                        $dataset[$uniquesampleid] = array();
                    }

                    // Append the features indicator features at the end of the sample.
                    $dataset[$uniquesampleid] = array_merge($dataset[$uniquesampleid], $features);
                }

                if (!$options['evaluation'] && $timesplitting->cache_indicator_calculations()) {
                    $timecreated = time();
                    foreach ($newindicatorcalculations as $sampleid => $calculatedvalue) {
                        // Prepare the new calculations to be stored into DB.

                        $indcalc = new \stdClass();
                        $indcalc->contextid = $timesplitting->get_analysable()->get_context()->id;
                        $indcalc->starttime = $range['start'];
                        $indcalc->endtime = $range['end'];
                        $indcalc->sampleid = $sampleid;
                        $indcalc->sampleorigin = $this->analyser->get_samples_origin();
                        $indcalc->indicator = $indicator->get_id();
                        $indcalc->value = $calculatedvalue;
                        $indcalc->timecreated = $timecreated;
                        $newcalculations[] = $indcalc;
                    }
                }
            }

            if (!$options['evaluation'] && $timesplitting->cache_indicator_calculations()) {
                $batchsize = self::get_insert_batch_size();

                // We don't want newcalculations array to grow too much as we already keep the
                // system memory busy storing $dataset contents. An indicator can add many batches worth
                // of calculations (samples x ranges), so we keep flushing until at most one batch is
                // pending. The batch size check guards against a never-ending loop.
                while ($batchsize > 0 && count($newcalculations) > $batchsize) {

                    // Insert from the beginning.
                    $remaining = array_splice($newcalculations, $batchsize);

                    // Sorry mssql and oracle, this will be slow.
                    $DB->insert_records('analytics_indicator_calc', $newcalculations);
                    $newcalculations = $remaining;
                }
            }
        }

        if (!$options['evaluation'] && $timesplitting->cache_indicator_calculations() && $newcalculations) {
            // Insert the remaining records.
            $DB->insert_records('analytics_indicator_calc', $newcalculations);
        }

        // Delete rows where all calculations are null.
        // We still store the indicator calculation and we still store the sample id as
        // processed so we don't have to process this sample again, but we exclude it
        // from the dataset because it is not useful.
        $nulls = array_diff_key($dataset, $notnulls);
        foreach ($nulls as $uniqueid => $ignoredvalues) {
            unset($dataset[$uniqueid]);
        }

        return $dataset;
    }
558
 
559
    /**
     * Adds time range indicators and the target to each sample.
     *
     * If the time splitting method defines more than one distinct range, each row is prefixed with one
     * column per range, all set to 0 except the one of the range the row belongs to, which is set to 1.
     * After that, the calculated target is appended to the row if targets were provided; otherwise the
     * unique sample id is prepended to it.
     *
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param array $dataset Passed by reference, the rows are modified in place.
     * @param array|null $calculatedtarget
     * @return void
     */
    protected function fill_dataset(\core_analytics\local\time_splitting\base $timesplitting,
            array &$dataset, ?array $calculatedtarget = null) {

        $nranges = count($timesplitting->get_distinct_ranges());

        // Range features are pointless if this time splitting method only defines one time range.
        $addrangecolumns = $nranges > 1;

        foreach (array_keys($dataset) as $uniquesampleid) {

            [$analysersampleid, $rangeindex] = $timesplitting->infer_sample_info($uniquesampleid);

            if ($addrangecolumns) {
                // 1 column for each range, flagging the one this row belongs to.
                $rangecolumns = array_fill(0, $nranges, 0);
                $rangecolumns[$rangeindex] = 1;

                $dataset[$uniquesampleid] = array_merge($rangecolumns, $dataset[$uniquesampleid]);
            }

            if (!$calculatedtarget) {
                // The sample id goes first, it will be used to identify the prediction that comes
                // back from the predictions processor.
                array_unshift($dataset[$uniquesampleid], $uniquesampleid);
                continue;
            }

            // This sample's calculated target goes at the end.
            $dataset[$uniquesampleid][] = $calculatedtarget[$analysersampleid];
        }
    }
600
 
601
    /**
     * Updates the analysable analysis time.
     *
     * If the analysable is part of $processedanalysables its analytics_used_analysables record is updated
     * with the current time. Otherwise a new record is inserted and the first analysis time is added to
     * the 'modelfirstanalyses' cache.
     *
     * @param array $processedanalysables Records indexed by analysable id, each with the record id in 'primarykey'.
     * @param int $analysableid
     * @return void
     */
    protected function update_analysable_analysed_time(array $processedanalysables, int $analysableid) {
        global $DB;

        $now = time();

        if (!empty($processedanalysables[$analysableid])) {
            // Work on a copy. The records are objects, so changing them here would also change the
            // caller's list: dropping 'primarykey' from the original would break a later update of
            // the same analysable.
            $obj = clone $processedanalysables[$analysableid];

            $obj->id = $obj->primarykey;
            unset($obj->primarykey);

            $obj->timeanalysed = $now;

            $DB->update_record('analytics_used_analysables', $obj);

        } else {

            $obj = new \stdClass();
            $obj->modelid = $this->analyser->get_modelid();
            $obj->action = ($this->includetarget) ? 'training' : 'prediction';
            $obj->analysableid = $analysableid;
            $obj->firstanalysis = $now;
            $obj->timeanalysed = $now;

            $DB->insert_record('analytics_used_analysables', $obj);

            // Update the cache just in case it is used in the same request.
            $key = $this->analyser->get_modelid() . '_' . $analysableid;
            $cache = \cache::make('core', 'modelfirstanalyses');
            $cache->set($key, $now);
        }
    }
640
 
641
    /**
     * Fills a cache containing the first time each analysable in the provided model was analysed.
     *
     * The cache keys are composed as '<modelid>_<analysableid>'.
     *
     * @param int $modelid
     * @param int|null $analysableid Limit the query to this analysable.
     * @return array The first analysis times that were cached, indexed by cache key. If the query found
     *               nothing, its (empty) result is returned as is and the cache is not touched.
     */
    public static function fill_firstanalyses_cache(int $modelid, ?int $analysableid = null) {
        global $DB;

        $params = ['modelid' => $modelid];
        $analysablefilter = '';
        if ($analysableid) {
            $analysablefilter = " AND analysableid = :analysableid";
            $params['analysableid'] = $analysableid;
        }

        // Using composed keys instead of cache $identifiers because of MDL-65358.
        $primarykey = $DB->sql_concat($modelid, "'_'", 'analysableid');
        $sql = "SELECT $primarykey AS id, MIN(firstanalysis) AS firstanalysis
                  FROM {analytics_used_analysables} aua
                 WHERE modelid = :modelid" . $analysablefilter . " GROUP BY modelid, analysableid ORDER BY analysableid";

        $records = $DB->get_records_sql($sql, $params);
        if (!$records) {
            return $records;
        }

        // Only the first analysis time of each record is cached.
        $firstanalyses = [];
        foreach ($records as $key => $record) {
            $firstanalyses[$key] = $record->firstanalysis;
        }

        \cache::make('core', 'modelfirstanalyses')->set_many($firstanalyses);

        return $firstanalyses;
    }
678
 
679
    /**
     * Adds dataset context info.
     *
     * This method prepends the header row returned by get_headers() to the dataset, so the dataset
     * it leaves behind looks like this:
     * ----------------------------------------------------
     * header1,header2,header3,header4,.....
     * stud1value1,stud1value2,stud1value3,stud1value4,.....
     * stud2value1,stud2value2,stud2value3,stud2value4,.....
     * .....
     * ----------------------------------------------------
     *
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param array $dataset Passed by reference.
     * @param \core_analytics\local\target\base $target
     * @return void
     */
    protected function add_context_metadata(\core_analytics\local\time_splitting\base $timesplitting, array &$dataset,
            \core_analytics\local\target\base $target) {

        // array_unshift() also renumbers the numeric keys of the dataset from zero.
        array_unshift($dataset, $this->get_headers($timesplitting, $target));
    }
705
 
706
    /**
     * Returns the headers for the csv file based on the indicators and the target.
     *
     * The columns are, in this order: 'sampleid' (only when the target is not included), one
     * 'range/<value>' column per distinct range (only if there is more than one), the feature headers
     * of each indicator and the target id (only when the target is included).
     *
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param \core_analytics\local\target\base $target
     * @return string[]
     */
    public function get_headers(\core_analytics\local\time_splitting\base $timesplitting,
            \core_analytics\local\target\base $target): array {

        // Without a target, the first column is the sampleid.
        $headers = $this->includetarget ? [] : ['sampleid'];

        // We always have 1 column for each time splitting method range, it does not depend on how
        // many ranges we calculated.
        $distinctranges = $timesplitting->get_distinct_ranges();
        if (count($distinctranges) > 1) {
            foreach ($distinctranges as $rangeindex) {
                array_push($headers, 'range/' . $rangeindex);
            }
        }

        // Model indicators.
        foreach ($this->analyser->get_indicators() as $indicator) {
            $featureheaders = $indicator::get_feature_headers();
            $headers = array_merge($headers, $featureheaders);
        }

        // The target goes last.
        if ($this->includetarget) {
            array_push($headers, $target->get_id());
        }

        return $headers;
    }
744
 
745
    /**
     * Removes from $sampleids the samples that were already used for training.
     *
     * Every analytics_train_samples record matching this model, analysable and time splitting
     * method stores a JSON encoded list of samples; samples are matched by array key.
     *
     * @param int[] $sampleids Modified in place.
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @return  null
     */
    protected function filter_out_train_samples(array &$sampleids, \core_analytics\local\time_splitting\base $timesplitting) {
        global $DB;

        $conditions = [
            'modelid' => $this->analyser->get_modelid(),
            'analysableid' => $timesplitting->get_analysable()->get_id(),
            'timesplitting' => $timesplitting->get_id(),
        ];

        foreach ($DB->get_records('analytics_train_samples', $conditions) as $record) {
            $alreadytrained = json_decode($record->sampleids, true);
            if (empty($alreadytrained)) {
                // Nothing stored in this record.
                continue;
            }

            // Keep only the samples this record does not list.
            $sampleids = array_diff_key($sampleids, $alreadytrained);
        }
    }
771
 
772
    /**
     * Removes from $sampleids the samples that already got predictions for the provided range.
     *
     * When every sample was already predicted the range is removed from $ranges. Samples are
     * matched by array key.
     *
     * @throws \coding_exception If $ranges contains more than one range.
     * @param int[] $sampleids Modified in place.
     * @param array $ranges A single range, indexed by its range index. Modified in place.
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @return  \stdClass|null The analytics_predict_samples record (with its sampleids already decoded
     *                         into an array) or null
     */
    protected function filter_out_prediction_samples_and_ranges(array &$sampleids, array &$ranges,
            \core_analytics\local\time_splitting\base $timesplitting) {

        if (count($ranges) > 1) {
            throw new \coding_exception('$ranges argument should only contain one range');
        }

        $rangeindex = key($ranges);

        $record = $this->get_predict_samples_record($timesplitting, $rangeindex);
        if (!$record) {
            // No previous predictions for this range, all samples are kept.
            return null;
        }

        $record->sampleids = json_decode($record->sampleids, true);
        $pendingsamples = array_diff_key($sampleids, $record->sampleids);

        if (count($pendingsamples) !== 0) {
            // Only the samples without predictions at this range remain.
            $sampleids = $pendingsamples;
            return $record;
        }

        // Every sample was already calculated, the whole range can be skipped.
        unset($ranges[$rangeindex]);
        return null;
    }
808
 
809
    /**
     * Fetches the analytics_predict_samples record of this model for the provided
     * analysable + time splitting method + range index.
     *
     * @param  \core_analytics\local\time_splitting\base $timesplitting
     * @param  int                                       $rangeindex
     * @return \stdClass|false
     */
    private function get_predict_samples_record(\core_analytics\local\time_splitting\base $timesplitting, int $rangeindex) {
        global $DB;

        $conditions = [
            'modelid' => $this->analyser->get_modelid(),
            'analysableid' => $timesplitting->get_analysable()->get_id(),
            'timesplitting' => $timesplitting->get_id(),
            'rangeindex' => $rangeindex,
        ];

        return $DB->get_record('analytics_predict_samples', $conditions);
    }
825
 
826
    /**
     * Stores the samples that have just been used for training.
     *
     * A new analytics_train_samples record is inserted with the JSON encoded list of samples.
     *
     * @param int[] $sampleids
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @return null
     */
    protected function save_train_samples(array $sampleids, \core_analytics\local\time_splitting\base $timesplitting) {
        global $DB;

        $record = (object)[
            'modelid' => $this->analyser->get_modelid(),
            'analysableid' => $timesplitting->get_analysable()->get_id(),
            'timesplitting' => $timesplitting->get_id(),
            'sampleids' => json_encode($sampleids),
            'timecreated' => time(),
        ];

        $DB->insert_record('analytics_train_samples', $record);
    }
846
 
847
    /**
     * Saves samples that have just been used for prediction.
     *
     * When an existing record is provided the new samples are appended to the ones it already
     * lists (existing keys win) and the record is updated, otherwise a new record is inserted.
     *
     * @throws \coding_exception If $ranges contains more than one range.
     * @param int[] $sampleids
     * @param array $ranges A single range, indexed by its range index.
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param \stdClass|null $predictsamplesrecord The existing record or null if there is no record yet. Its
     *                                             sampleids can be either the decoded array or the JSON string
     *                                             as stored in the database.
     * @return null
     */
    protected function save_prediction_samples(array $sampleids, array $ranges,
            \core_analytics\local\time_splitting\base $timesplitting, ?\stdClass $predictsamplesrecord = null) {
        global $DB;

        if (count($ranges) > 1) {
            throw new \coding_exception('$ranges argument should only contain one range');
        }

        $rangeindex = key($ranges);

        if ($predictsamplesrecord) {
            $previoussamples = $predictsamplesrecord->sampleids;
            if (is_string($previoussamples)) {
                // The record comes straight from the database: decode it first, the array union
                // below is not supported on a string operand.
                $previoussamples = json_decode($previoussamples, true);
            }

            // Append the new samples used for prediction.
            $predictsamplesrecord->sampleids = json_encode($previoussamples + $sampleids);
            $predictsamplesrecord->timemodified = time();
            $DB->update_record('analytics_predict_samples', $predictsamplesrecord);
        } else {
            $predictsamplesrecord = (object)[
                'modelid' => $this->analyser->get_modelid(),
                'analysableid' => $timesplitting->get_analysable()->get_id(),
                'timesplitting' => $timesplitting->get_id(), 'rangeindex' => $rangeindex
            ];
            $predictsamplesrecord->sampleids = json_encode($sampleids);
            $predictsamplesrecord->timecreated = time();
            $predictsamplesrecord->timemodified = $predictsamplesrecord->timecreated;
            $DB->insert_record('analytics_predict_samples', $predictsamplesrecord);
        }
    }
883
 
884
    /**
     * Tries to get the lock for this model + analysable + time splitting method combination.
     *
     * The lock, or the falsy value returned by the lock factory when it can not be obtained, is
     * stored in $this->lock.
     *
     * @param  string $timesplittingid
     * @param  int    $analysableid
     * @return bool Success or not
     */
    private function init_analysable_analysis(string $timesplittingid, int $analysableid) {

        // $this->includetarget is not part of the key on purpose: the same analysable should not be
        // analysed for training and for prediction at the same time.
        $modelid = $this->analyser->get_modelid();
        $cleantimesplittingid = self::clean_time_splitting_id($timesplittingid);
        $lockkey = 'modelid:' . $modelid . '-analysableid:' . $analysableid .
            '-timesplitting:' . $cleantimesplittingid;

        $factory = \core\lock\lock_config::get_lock_factory('core_analytics');

        // Wait up to 10 secs for the lock. If it can not be obtained this combination is skipped
        // and it will be attempted again during the next cron run.
        $this->lock = $factory->get_lock($lockkey, 10);

        return (bool)$this->lock;
    }
908
 
909
 
910
    /**
     * Cleans a time splitting method id (id = its full class name) of possibly problematic chars.
     *
     * Backslashes are converted to dashes and the result is cleaned as PARAM_ALPHANUMEXT.
     *
     * @param string $timesplittingid
     * @return string
     */
    public static function clean_time_splitting_id($timesplittingid) {
        return clean_param(str_replace('\\', '-', $timesplittingid), PARAM_ALPHANUMEXT);
    }
920
 
921
    /**
     * Finishes the analysis of the current analysable + time splitting method by releasing
     * the lock stored in $this->lock.
     *
     * @return null
     */
    private function finish_analysable_analysis() {
        $currentlock = $this->lock;
        $currentlock->release();
    }
929
 
930
    /**
931
     * Returns the batch size used for insert_records.
932
     *
933
     * This method tries to find the best batch size without getting
934
     * into dml internals. Maximum 1000 records to save memory.
935
     *
936
     * @return int
937
     */
938
    private static function get_insert_batch_size(): int {
939
        global $DB;
940
 
941
        $dbconfig = $DB->export_dbconfig();
942
 
943
        // 500 is pgsql default so using 1000 is fine, no other db driver uses a hardcoded value.
944
        if (empty($dbconfig) || empty($dbconfig->dboptions) || empty($dbconfig->dboptions['bulkinsertsize'])) {
945
            return 1000;
946
        }
947
 
948
        $bulkinsert = $dbconfig->dboptions['bulkinsertsize'];
949
        if ($bulkinsert < 1000) {
950
            return $bulkinsert;
951
        }
952
 
953
        while ($bulkinsert > 1000) {
954
            $bulkinsert = round($bulkinsert / 2, 0);
955
        }
956
 
957
        return (int)$bulkinsert;
958
    }
959
}