1 |
efrain |
1 |
<?php
|
|
|
2 |
|
|
|
3 |
namespace PhpOffice\PhpSpreadsheet\Reader\Csv;
|
|
|
4 |
|
|
|
5 |
/**
 * Infers the field delimiter of a CSV stream by sampling its leading lines.
 *
 * On construction the stream is read (up to a fixed number of logical lines)
 * and, for each candidate delimiter, the number of occurrences found outside
 * of enclosures is recorded per line. infer() then selects the candidate whose
 * per-line count deviates least from its median.
 *
 * The file handle is read from its current position and is not rewound by
 * this class.
 */
class Delimiter
{
    protected const POTENTIAL_DELIMETERS = [',', ';', "\t", '|', ':', ' ', '~'];

    /** Upper bound on the number of logical lines sampled from the stream. */
    private const MAX_SAMPLED_LINES = 1000;

    /** @var resource */
    protected $fileHandle;

    /** @var string */
    protected $escapeCharacter;

    /** @var string */
    protected $enclosure;

    /**
     * Per-delimiter list of occurrence counts, one entry per sampled line.
     *
     * @var array
     */
    protected $counts = [];

    /**
     * Number of logical lines that were sampled (and counted).
     *
     * @var int
     */
    protected $numberLines = 0;

    /** @var ?string */
    protected $delimiter;

    /**
     * Stores the stream settings and immediately samples the stream.
     *
     * @param resource $fileHandle
     */
    public function __construct($fileHandle, string $escapeCharacter, string $enclosure)
    {
        $this->fileHandle = $fileHandle;
        $this->escapeCharacter = $escapeCharacter;
        $this->enclosure = $enclosure;

        $this->countPotentialDelimiters();
    }

    /**
     * Returns the first candidate delimiter, for callers that need a fallback.
     */
    public function getDefaultDelimiter(): string
    {
        return self::POTENTIAL_DELIMETERS[0];
    }

    /**
     * Returns how many logical lines were sampled from the stream.
     */
    public function linesCounted(): int
    {
        return $this->numberLines;
    }

    /**
     * Samples the stream and records, per line, how often each candidate occurs.
     *
     * The limit is checked before reading so that every line taken from the
     * stream is also counted; numberLines therefore always equals the length
     * of each series in $this->counts.
     */
    protected function countPotentialDelimiters(): void
    {
        $this->counts = array_fill_keys(self::POTENTIAL_DELIMETERS, []);
        $delimiterKeys = array_flip(self::POTENTIAL_DELIMETERS);

        // Count how many times each of the potential delimiters appears in each line
        $this->numberLines = 0;
        while (
            $this->numberLines < self::MAX_SAMPLED_LINES
            && ($line = $this->getNextLine()) !== false
        ) {
            $this->countDelimiterValues($line, $delimiterKeys);
            ++$this->numberLines;
        }
    }

    /**
     * Appends one occurrence count per candidate delimiter for the given line.
     *
     * @param string $line          line content with enclosed sections already removed
     * @param array  $delimiterKeys candidate delimiters as array keys (array_flip of the candidates)
     */
    protected function countDelimiterValues(string $line, array $delimiterKeys): void
    {
        $splitString = str_split($line, 1);
        // Defensive: str_split() is documented as able to return false before PHP 8.
        if (!is_array($splitString)) {
            return;
        }

        // Byte frequency of the line, reduced to the candidate delimiters only.
        $countLine = array_intersect_key(array_count_values($splitString), $delimiterKeys);

        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            $this->counts[$delimiter][] = $countLine[$delimiter] ?? 0;
        }
    }

    /**
     * Picks the candidate delimiter with the most consistent per-line count.
     *
     * For every candidate the median of its per-line counts is computed;
     * candidates whose median is zero are ignored. Among the rest, the one with
     * the smallest mean square deviation from its median wins. Ties are
     * resolved in favour of the candidate listed first in POTENTIAL_DELIMETERS.
     *
     * @return ?string the inferred delimiter, or the previously stored value
     *                 (null unless set earlier) when no candidate qualifies
     */
    public function infer(): ?string
    {
        $min = INF;

        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            $series = $this->counts[$delimiter] ?? [];
            $sampleSize = count($series);

            // Nothing was sampled (e.g. empty stream): no median to compute.
            if ($sampleSize === 0) {
                continue;
            }

            sort($series);

            // Median is taken from the series actually collected, so the index
            // can never fall outside of it.
            $upperMiddle = intdiv($sampleSize, 2);
            $median = ($sampleSize % 2 === 1)
                ? $series[$upperMiddle]
                : ($series[$upperMiddle - 1] + $series[$upperMiddle]) / 2;

            // Ignore delimiters that haven't been found consistently.
            // (int / int yields int 0 here when both middle values are 0.)
            if ($median === 0) {
                continue;
            }

            $meanSquareDeviation = array_reduce(
                $series,
                static function ($sum, $value) use ($median) {
                    return $sum + ($value - $median) ** 2;
                },
                0
            ) / $sampleSize;

            // Strict comparison keeps the earliest candidate on ties.
            if ($meanSquareDeviation < $min) {
                $min = $meanSquareDeviation;
                $this->delimiter = $delimiter;
            }
        }

        return $this->delimiter;
    }

    /**
     * Get the next full line from the file.
     *
     * Physical lines are concatenated for as long as an enclosure is left
     * open, and every completed enclosed section is removed so that delimiter
     * characters inside enclosures are not counted. If the end of the stream is
     * reached while reading (including while an enclosure is still open),
     * false is returned.
     *
     * @return false|string
     */
    public function getNextLine()
    {
        // Without an enclosure character there is nothing to strip; the
        // patterns below would otherwise match every string and the loop
        // would consume the whole stream.
        if ($this->enclosure === '') {
            return fgets($this->fileHandle);
        }

        // An enclosure character only counts when not preceded by the escape character.
        $enclosure = ($this->escapeCharacter === '' ? ''
            : ('(?<!' . preg_quote($this->escapeCharacter, '/') . ')'))
            . preg_quote($this->enclosure, '/');
        $enclosedSectionPattern = '/(' . $enclosure . '.*' . $enclosure . ')/Us';
        $openEnclosurePattern = '/(' . $enclosure . ')/';

        $line = '';

        do {
            // Get the next line in the file
            $newLine = fgets($this->fileHandle);

            // Return false if there is no next line
            if ($newLine === false) {
                return false;
            }

            // Add the new line to what has been accumulated so far
            $line .= $newLine;

            // Drop everything that is enclosed to avoid counting false positives in enclosures
            $line = (string) preg_replace($enclosedSectionPattern, '', $line);

            // If an enclosure is still present it has not been closed yet,
            // so the next physical line belongs to this logical line as well
        } while (preg_match($openEnclosurePattern, $line) > 0);

        return ($line !== '') ? $line : false;
    }
}
|