| 1 |
efrain |
1 |
<?php
|
|
|
2 |
|
|
|
3 |
declare(strict_types=1);
|
|
|
4 |
|
|
|
5 |
namespace Phpml\Dataset;
|
|
|
6 |
|
|
|
7 |
use Phpml\Exception\DatasetException;
|
|
|
8 |
use Phpml\Exception\FileException;
|
|
|
9 |
|
|
|
10 |
/**
 * Dataset loaded from a sparse text file where every line has the form
 *
 *     <target> <index>:<value> <index>:<value> ... # optional comment
 *
 * (this appears to be the libsvm / SVMlight layout - confirm against the
 * project documentation). Parsing rules implemented below:
 *
 *  - everything after the first '#' on a line is ignored;
 *  - trailing whitespace is stripped and tabs are treated as single spaces;
 *  - columns are separated by exactly one space, so leading or repeated
 *    spaces (and blank lines) yield empty columns and are rejected;
 *  - feature indices in the file are 1-based and are stored 0-based;
 *  - features missing from a line are filled with 0, and every sample is
 *    padded to the width of the largest index seen in the whole file.
 */
class SvmDataset extends ArrayDataset
{
    /**
     * Reads the file at $filePath and hands the parsed samples and targets
     * to the parent dataset.
     *
     * @throws FileException    when the file is missing or cannot be opened
     * @throws DatasetException when a line contains an invalid target, index or value
     */
    public function __construct(string $filePath)
    {
        [$samples, $targets] = self::readProblem($filePath);

        parent::__construct($samples, $targets);
    }

    /**
     * Parses the whole file into dense samples and their targets.
     *
     * @return array two-element list: [samples, targets]
     *
     * @throws FileException
     * @throws DatasetException
     */
    private static function readProblem(string $filePath): array
    {
        $handle = self::openFile($filePath);

        $samples = [];
        $targets = [];
        $maxIndex = 0;

        // Close the handle even when a malformed line makes processLine() throw;
        // otherwise the stream would stay open until garbage collection.
        try {
            while (false !== $line = fgets($handle)) {
                [$sample, $target, $maxIndex] = self::processLine($line, $maxIndex);
                $samples[] = $sample;
                $targets[] = $target;
            }
        } finally {
            fclose($handle);
        }

        // Earlier lines were sized with the maximum index known at the time they
        // were read; pad them to the final width. Assigning by key avoids the
        // dangling reference left behind by a by-reference foreach.
        foreach ($samples as $key => $sample) {
            $samples[$key] = array_pad($sample, $maxIndex + 1, 0);
        }

        return [$samples, $targets];
    }

    /**
     * Opens the file for binary reading.
     *
     * @return resource
     *
     * @throws FileException when the file does not exist or fopen() fails
     */
    private static function openFile(string $filePath)
    {
        if (!file_exists($filePath)) {
            throw new FileException(sprintf('File "%s" missing.', basename($filePath)));
        }

        $handle = fopen($filePath, 'rb');
        if ($handle === false) {
            throw new FileException(sprintf('File "%s" can\'t be open.', basename($filePath)));
        }

        return $handle;
    }

    /**
     * Converts one raw line into a dense sample and its target.
     *
     * @param string $line     raw line as returned by fgets()
     * @param int    $maxIndex largest 0-based feature index seen so far
     *
     * @return array three-element list: [sample, target, updated maxIndex]
     *
     * @throws DatasetException
     */
    private static function processLine(string $line, int $maxIndex): array
    {
        $columns = self::parseLine($line);

        // explode() always returns at least one element, so $columns[0] exists;
        // for a blank line it is '' and is rejected as an invalid target.
        $target = self::parseTargetColumn($columns[0]);
        $sample = array_fill(0, $maxIndex + 1, 0);

        $n = count($columns);
        for ($i = 1; $i < $n; ++$i) {
            [$index, $value] = self::parseFeatureColumn($columns[$i]);
            if ($index > $maxIndex) {
                // Grow the sample so that the new, larger index is addressable.
                $maxIndex = $index;
                $sample = array_pad($sample, $maxIndex + 1, 0);
            }

            $sample[$index] = $value;
        }

        return [$sample, $target, $maxIndex];
    }

    /**
     * Strips the comment and trailing whitespace from a line and splits it
     * into columns on single spaces (tabs count as spaces).
     *
     * @return string[] columns; the first one is the target
     */
    private static function parseLine(string $line): array
    {
        $line = explode('#', $line, 2)[0];
        $line = rtrim($line);
        $line = str_replace("\t", ' ', $line);

        return explode(' ', $line);
    }

    /**
     * Validates and converts the target column.
     *
     * @throws DatasetException when the column is not numeric
     */
    private static function parseTargetColumn(string $column): float
    {
        if (!is_numeric($column)) {
            throw new DatasetException(sprintf('Invalid target "%s".', $column));
        }

        return (float) $column;
    }

    /**
     * Splits an "index:value" column and validates both parts.
     *
     * @return array two-element list: [0-based index, value]
     *
     * @throws DatasetException when the column has no ':' or either part is invalid
     */
    private static function parseFeatureColumn(string $column): array
    {
        $feature = explode(':', $column, 2);
        if (count($feature) !== 2) {
            throw new DatasetException(sprintf('Invalid value "%s".', $column));
        }

        $index = self::parseFeatureIndex($feature[0]);
        $value = self::parseFeatureValue($feature[1]);

        return [$index, $value];
    }

    /**
     * Converts a 1-based index string from the file into a 0-based array index.
     *
     * @throws DatasetException when the index is not a positive whole number
     */
    private static function parseFeatureIndex(string $index): int
    {
        // ctype_digit() only accepts non-empty all-digit strings, which already
        // implies is_numeric(); the range check rejects "0" (indices start at 1).
        if (!ctype_digit($index) || (int) $index < 1) {
            throw new DatasetException(sprintf('Invalid index "%s".', $index));
        }

        return (int) $index - 1;
    }

    /**
     * Validates and converts a feature value.
     *
     * @throws DatasetException when the value is not numeric
     */
    private static function parseFeatureValue(string $value): float
    {
        if (!is_numeric($value)) {
            throw new DatasetException(sprintf('Invalid value "%s".', $value));
        }

        return (float) $value;
    }
}
|