1 |
efrain |
1 |
<?php
|
|
|
2 |
|
|
|
3 |
declare(strict_types=1);
|
|
|
4 |
|
|
|
5 |
namespace Phpml\DimensionReduction;
|
|
|
6 |
|
|
|
7 |
use Closure;
|
|
|
8 |
use Phpml\Exception\InvalidArgumentException;
|
|
|
9 |
use Phpml\Exception\InvalidOperationException;
|
|
|
10 |
use Phpml\Math\Distance\Euclidean;
|
|
|
11 |
use Phpml\Math\Distance\Manhattan;
|
|
|
12 |
use Phpml\Math\Matrix;
|
|
|
13 |
|
|
|
14 |
class KernelPCA extends PCA
|
|
|
15 |
{
|
|
|
16 |
public const KERNEL_RBF = 1;
|
|
|
17 |
|
|
|
18 |
public const KERNEL_SIGMOID = 2;
|
|
|
19 |
|
|
|
20 |
public const KERNEL_LAPLACIAN = 3;
|
|
|
21 |
|
|
|
22 |
public const KERNEL_LINEAR = 4;
|
|
|
23 |
|
|
|
24 |
/**
|
|
|
25 |
* Selected kernel function
|
|
|
26 |
*
|
|
|
27 |
* @var int
|
|
|
28 |
*/
|
|
|
29 |
protected $kernel;
|
|
|
30 |
|
|
|
31 |
/**
|
|
|
32 |
* Gamma value used by the kernel
|
|
|
33 |
*
|
|
|
34 |
* @var float|null
|
|
|
35 |
*/
|
|
|
36 |
protected $gamma;
|
|
|
37 |
|
|
|
38 |
/**
|
|
|
39 |
* Original dataset used to fit KernelPCA
|
|
|
40 |
*
|
|
|
41 |
* @var array
|
|
|
42 |
*/
|
|
|
43 |
protected $data = [];
|
|
|
44 |
|
|
|
45 |
/**
|
|
|
46 |
* Kernel principal component analysis (KernelPCA) is an extension of PCA using
|
|
|
47 |
* techniques of kernel methods. It is more suitable for data that involves
|
|
|
48 |
* vectors that are not linearly separable<br><br>
|
|
|
49 |
* Example: <b>$kpca = new KernelPCA(KernelPCA::KERNEL_RBF, null, 2, 15.0);</b>
|
|
|
50 |
* will initialize the algorithm with an RBF kernel having the gamma parameter as 15,0. <br>
|
|
|
51 |
* This transformation will return the same number of rows with only <i>2</i> columns.
|
|
|
52 |
*
|
|
|
53 |
* @param float $totalVariance Total variance to be preserved if numFeatures is not given
|
|
|
54 |
* @param int $numFeatures Number of columns to be returned
|
|
|
55 |
* @param float $gamma Gamma parameter is used with RBF and Sigmoid kernels
|
|
|
56 |
*
|
|
|
57 |
* @throws InvalidArgumentException
|
|
|
58 |
*/
|
|
|
59 |
public function __construct(int $kernel = self::KERNEL_RBF, ?float $totalVariance = null, ?int $numFeatures = null, ?float $gamma = null)
|
|
|
60 |
{
|
|
|
61 |
if (!in_array($kernel, [self::KERNEL_RBF, self::KERNEL_SIGMOID, self::KERNEL_LAPLACIAN, self::KERNEL_LINEAR], true)) {
|
|
|
62 |
throw new InvalidArgumentException('KernelPCA can be initialized with the following kernels only: Linear, RBF, Sigmoid and Laplacian');
|
|
|
63 |
}
|
|
|
64 |
|
|
|
65 |
parent::__construct($totalVariance, $numFeatures);
|
|
|
66 |
|
|
|
67 |
$this->kernel = $kernel;
|
|
|
68 |
$this->gamma = $gamma;
|
|
|
69 |
}
|
|
|
70 |
|
|
|
71 |
/**
|
|
|
72 |
* Takes a data and returns a lower dimensional version
|
|
|
73 |
* of this data while preserving $totalVariance or $numFeatures. <br>
|
|
|
74 |
* $data is an n-by-m matrix and returned array is
|
|
|
75 |
* n-by-k matrix where k <= m
|
|
|
76 |
*/
|
|
|
77 |
public function fit(array $data): array
|
|
|
78 |
{
|
|
|
79 |
$numRows = count($data);
|
|
|
80 |
$this->data = $data;
|
|
|
81 |
|
|
|
82 |
if ($this->gamma === null) {
|
|
|
83 |
$this->gamma = 1.0 / $numRows;
|
|
|
84 |
}
|
|
|
85 |
|
|
|
86 |
$matrix = $this->calculateKernelMatrix($this->data, $numRows);
|
|
|
87 |
$matrix = $this->centerMatrix($matrix, $numRows);
|
|
|
88 |
|
|
|
89 |
$this->eigenDecomposition($matrix);
|
|
|
90 |
|
|
|
91 |
$this->fit = true;
|
|
|
92 |
|
|
|
93 |
return Matrix::transposeArray($this->eigVectors);
|
|
|
94 |
}
|
|
|
95 |
|
|
|
96 |
/**
|
|
|
97 |
* Transforms the given sample to a lower dimensional vector by using
|
|
|
98 |
* the variables obtained during the last run of <code>fit</code>.
|
|
|
99 |
*
|
|
|
100 |
* @throws InvalidArgumentException
|
|
|
101 |
* @throws InvalidOperationException
|
|
|
102 |
*/
|
|
|
103 |
public function transform(array $sample): array
|
|
|
104 |
{
|
|
|
105 |
if (!$this->fit) {
|
|
|
106 |
throw new InvalidOperationException('KernelPCA has not been fitted with respect to original dataset, please run KernelPCA::fit() first');
|
|
|
107 |
}
|
|
|
108 |
|
|
|
109 |
if (is_array($sample[0])) {
|
|
|
110 |
throw new InvalidArgumentException('KernelPCA::transform() accepts only one-dimensional arrays');
|
|
|
111 |
}
|
|
|
112 |
|
|
|
113 |
$pairs = $this->getDistancePairs($sample);
|
|
|
114 |
|
|
|
115 |
return $this->projectSample($pairs);
|
|
|
116 |
}
|
|
|
117 |
|
|
|
118 |
/**
|
|
|
119 |
* Calculates similarity matrix by use of selected kernel function<br>
|
|
|
120 |
* An n-by-m matrix is given and an n-by-n matrix is returned
|
|
|
121 |
*/
|
|
|
122 |
protected function calculateKernelMatrix(array $data, int $numRows): array
|
|
|
123 |
{
|
|
|
124 |
$kernelFunc = $this->getKernel();
|
|
|
125 |
|
|
|
126 |
$matrix = [];
|
|
|
127 |
for ($i = 0; $i < $numRows; ++$i) {
|
|
|
128 |
for ($k = 0; $k < $numRows; ++$k) {
|
|
|
129 |
if ($i <= $k) {
|
|
|
130 |
$matrix[$i][$k] = $kernelFunc($data[$i], $data[$k]);
|
|
|
131 |
} else {
|
|
|
132 |
$matrix[$i][$k] = $matrix[$k][$i];
|
|
|
133 |
}
|
|
|
134 |
}
|
|
|
135 |
}
|
|
|
136 |
|
|
|
137 |
return $matrix;
|
|
|
138 |
}
|
|
|
139 |
|
|
|
140 |
/**
|
|
|
141 |
* Kernel matrix is centered in its original space by using the following
|
|
|
142 |
* conversion:
|
|
|
143 |
*
|
|
|
144 |
* K′ = K − N.K − K.N + N.K.N where N is n-by-n matrix filled with 1/n
|
|
|
145 |
*/
|
|
|
146 |
protected function centerMatrix(array $matrix, int $n): array
|
|
|
147 |
{
|
|
|
148 |
$N = array_fill(0, $n, array_fill(0, $n, 1.0 / $n));
|
|
|
149 |
$N = new Matrix($N, false);
|
|
|
150 |
$K = new Matrix($matrix, false);
|
|
|
151 |
|
|
|
152 |
// K.N (This term is repeated so we cache it once)
|
|
|
153 |
$K_N = $K->multiply($N);
|
|
|
154 |
// N.K
|
|
|
155 |
$N_K = $N->multiply($K);
|
|
|
156 |
// N.K.N
|
|
|
157 |
$N_K_N = $N->multiply($K_N);
|
|
|
158 |
|
|
|
159 |
return $K->subtract($N_K)
|
|
|
160 |
->subtract($K_N)
|
|
|
161 |
->add($N_K_N)
|
|
|
162 |
->toArray();
|
|
|
163 |
}
|
|
|
164 |
|
|
|
165 |
/**
|
|
|
166 |
* Returns the callable kernel function
|
|
|
167 |
*
|
|
|
168 |
* @throws \Exception
|
|
|
169 |
*/
|
|
|
170 |
protected function getKernel(): Closure
|
|
|
171 |
{
|
|
|
172 |
switch ($this->kernel) {
|
|
|
173 |
case self::KERNEL_LINEAR:
|
|
|
174 |
// k(x,y) = xT.y
|
|
|
175 |
return function ($x, $y) {
|
|
|
176 |
return Matrix::dot($x, $y)[0];
|
|
|
177 |
};
|
|
|
178 |
case self::KERNEL_RBF:
|
|
|
179 |
// k(x,y)=exp(-γ.|x-y|) where |..| is Euclidean distance
|
|
|
180 |
$dist = new Euclidean();
|
|
|
181 |
|
|
|
182 |
return function ($x, $y) use ($dist): float {
|
|
|
183 |
return exp(-$this->gamma * $dist->sqDistance($x, $y));
|
|
|
184 |
};
|
|
|
185 |
|
|
|
186 |
case self::KERNEL_SIGMOID:
|
|
|
187 |
// k(x,y)=tanh(γ.xT.y+c0) where c0=1
|
|
|
188 |
return function ($x, $y): float {
|
|
|
189 |
$res = Matrix::dot($x, $y)[0] + 1.0;
|
|
|
190 |
|
|
|
191 |
return tanh((float) $this->gamma * $res);
|
|
|
192 |
};
|
|
|
193 |
|
|
|
194 |
case self::KERNEL_LAPLACIAN:
|
|
|
195 |
// k(x,y)=exp(-γ.|x-y|) where |..| is Manhattan distance
|
|
|
196 |
$dist = new Manhattan();
|
|
|
197 |
|
|
|
198 |
return function ($x, $y) use ($dist): float {
|
|
|
199 |
return exp(-$this->gamma * $dist->distance($x, $y));
|
|
|
200 |
};
|
|
|
201 |
|
|
|
202 |
default:
|
|
|
203 |
// Not reached
|
|
|
204 |
throw new InvalidArgumentException(sprintf('KernelPCA initialized with invalid kernel: %d', $this->kernel));
|
|
|
205 |
}
|
|
|
206 |
}
|
|
|
207 |
|
|
|
208 |
protected function getDistancePairs(array $sample): array
|
|
|
209 |
{
|
|
|
210 |
$kernel = $this->getKernel();
|
|
|
211 |
|
|
|
212 |
$pairs = [];
|
|
|
213 |
foreach ($this->data as $row) {
|
|
|
214 |
$pairs[] = $kernel($row, $sample);
|
|
|
215 |
}
|
|
|
216 |
|
|
|
217 |
return $pairs;
|
|
|
218 |
}
|
|
|
219 |
|
|
|
220 |
protected function projectSample(array $pairs): array
|
|
|
221 |
{
|
|
|
222 |
// Normalize eigenvectors by eig = eigVectors / eigValues
|
|
|
223 |
$func = function ($eigVal, $eigVect) {
|
|
|
224 |
$m = new Matrix($eigVect, false);
|
|
|
225 |
$a = $m->divideByScalar($eigVal)->toArray();
|
|
|
226 |
|
|
|
227 |
return $a[0];
|
|
|
228 |
};
|
|
|
229 |
$eig = array_map($func, $this->eigValues, $this->eigVectors);
|
|
|
230 |
|
|
|
231 |
// return k.dot(eig)
|
|
|
232 |
return Matrix::dot($pairs, $eig);
|
|
|
233 |
}
|
|
|
234 |
}
|