<?php

declare(strict_types=1);

namespace Phpml\DimensionReduction;

use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\Math\Statistic\Covariance;
use Phpml\Math\Statistic\Mean;

class PCA extends EigenTransformerBase
{
    /**
     * Mean value of each dimension (column), computed from the data given to
     * the most recent fit() and subtracted from every sample in normalize().
     *
     * @var array
     */
    protected $means = [];

    /**
     * True once fit() has completed; transform() refuses to run before that.
     *
     * @var bool
     */
    protected $fit = false;

    /**
     * PCA (Principal Component Analysis) explains the given data with a lower
     * number of dimensions. The data is transformed to a lower dimensional
     * version of itself while conserving a proportion of the total variance
     * within the data. It is a lossy data compression technique.
     *
     * Exactly one of the two arguments must be provided. The supplied value is
     * stored on $this->totalVariance or $this->numFeatures; those properties are
     * not declared in this class (presumably inherited from EigenTransformerBase).
     *
     * @param float $totalVariance Total explained variance to be preserved (0.1 to 0.99 inclusive)
     * @param int   $numFeatures   Number of features to be preserved (greater than 0)
     *
     * @throws InvalidArgumentException when an argument is out of range, or when
     *                                  both or neither of the arguments are given
     */
    public function __construct(?float $totalVariance = null, ?int $numFeatures = null)
    {
        if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
            throw new InvalidArgumentException('Total variance can be a value between 0.1 and 0.99');
        }

        if ($numFeatures !== null && $numFeatures <= 0) {
            throw new InvalidArgumentException('Number of features to be preserved should be greater than 0');
        }

        // Both sides equal means "neither given" or "both given"; only exactly one is valid.
        if (($totalVariance !== null) === ($numFeatures !== null)) {
            throw new InvalidArgumentException('Either totalVariance or numFeatures should be specified in order to run the algorithm');
        }

        if ($numFeatures !== null) {
            $this->numFeatures = $numFeatures;
        }

        if ($totalVariance !== null) {
            $this->totalVariance = $totalVariance;
        }
    }

    /**
     * Takes a data set and returns a lower dimensional version of it while
     * preserving $totalVariance or $numFeatures.
     *
     * $data is an n-by-m matrix and the returned array is an
     * n-by-k matrix where k <= m.
     *
     * Every call recomputes the per-dimension means from $data, so the
     * transformer can be re-fitted on a different data set.
     *
     * @throws InvalidArgumentException when $data has no row at index 0 or that row is not an array
     */
    public function fit(array $data): array
    {
        if (!isset($data[0]) || !is_array($data[0])) {
            throw new InvalidArgumentException('Data to fit should be a non-empty array of samples');
        }

        // Number of dimensions (columns), taken from the first row.
        $n = count($data[0]);

        // Until this run completes, the object must not be usable through
        // transform(): the means below are replaced before the eigenvectors are.
        $this->fit = false;

        // Always derive the means from the data being fitted. normalize() only
        // computes them lazily when none are stored, which would otherwise make a
        // second fit() centre the new data with the previous data set's means.
        $this->calculateMeans($data, $n);

        $data = $this->normalize($data, $n);

        // The data is already centred, so an all-zero vector is passed as the
        // second argument (presumably the per-column means expected by
        // covarianceMatrix(); confirm against Covariance).
        $covMatrix = Covariance::covarianceMatrix($data, array_fill(0, $n, 0));

        // eigenDecomposition() and reduce() are not defined in this class
        // (presumably provided by EigenTransformerBase).
        $this->eigenDecomposition($covMatrix);

        $this->fit = true;

        return $this->reduce($data);
    }

    /**
     * Transforms the given sample to a lower dimensional vector by using
     * the eigenVectors obtained in the last run of <code>fit</code>.
     *
     * A single sample (a flat array) is wrapped into a one-row matrix first;
     * an array whose first element is itself an array is treated as a list of samples.
     *
     * @throws InvalidOperationException when fit() has not been run yet
     * @throws InvalidArgumentException  when $sample is empty
     */
    public function transform(array $sample): array
    {
        if (!$this->fit) {
            throw new InvalidOperationException('PCA has not been fitted with respect to original dataset, please run PCA::fit() first');
        }

        if ($sample === []) {
            throw new InvalidArgumentException('Sample to transform should not be empty');
        }

        if (!is_array($sample[0])) {
            $sample = [$sample];
        }

        // Centre the sample(s) with the stored means before projecting.
        $sample = $this->normalize($sample, count($sample[0]));

        return $this->reduce($sample);
    }

    /**
     * Computes the arithmetic mean of each of the first $n columns of $data
     * and stores the results, in column order, in $this->means
     * (replacing any previously stored means).
     *
     * @param array $data Rows of samples
     * @param int   $n    Number of columns (dimensions) to process
     */
    protected function calculateMeans(array $data, int $n): void
    {
        // Calculate means for each dimension
        $this->means = [];
        for ($i = 0; $i < $n; ++$i) {
            $column = array_column($data, $i);
            $this->means[] = Mean::arithmetic($column);
        }
    }

    /**
     * Normalization of the data consists of subtracting the mean from
     * each dimension, therefore dimensions will be centered to zero.
     *
     * If no means are stored yet, they are first calculated from $data itself;
     * otherwise the stored means are reused.
     *
     * @param array $data Rows of samples
     * @param int   $n    Number of columns (dimensions) to centre in every row
     *
     * @return array The centred copy of $data, with row keys preserved
     */
    protected function normalize(array $data, int $n): array
    {
        if (count($this->means) === 0) {
            $this->calculateMeans($data, $n);
        }

        // Normalize data
        foreach (array_keys($data) as $i) {
            for ($k = 0; $k < $n; ++$k) {
                $data[$i][$k] -= $this->means[$k];
            }
        }

        return $data;
    }
}