changes 0230092025

This commit is contained in:
Simon Pocrnjič
2025-09-30 00:06:47 +02:00
parent 1fddf959f0
commit a2bb75fdcc
31 changed files with 2729 additions and 628 deletions
+98 -9
View File
@@ -5,15 +5,79 @@
class CsvImportService
{
/**
* Read the first line of a file; returns null on failure.
* Normalize a line to UTF-8 and strip BOM / control characters for robust splitting.
*/
public function readFirstLine(string $path): ?string
/**
 * Normalize one raw line of a delimited text file to UTF-8 so it can be split reliably.
 *
 * Steps, in order:
 *  - a leading UTF-8 BOM is removed;
 *  - a leading UTF-16 BOM is removed and selects the matching UTF-16 decoder;
 *  - a BOM-less line containing NUL bytes is treated as UTF-16, with the byte order
 *    guessed from the positions of the NUL bytes;
 *  - any other line is converted to UTF-8 when mbstring detects a different encoding;
 *  - UTF-8 non-breaking spaces (U+00A0) are replaced with regular spaces.
 *
 * Without the mbstring extension the UTF-16 paths only drop NUL bytes, which is
 * lossless for ASCII content only. No other control characters are removed and the
 * line terminator, if present, is kept.
 *
 * @param string $line Raw bytes of a single line.
 * @return string The normalized line.
 */
private function normalizeLine(string $line): string
{
    $canConvert = function_exists('mb_convert_encoding');

    // Decode UTF-16 bytes to UTF-8; without mbstring, dropping NULs is the best effort left.
    $fromUtf16 = static function (string $bytes, string $encoding) use ($canConvert): string {
        if (! $canConvert) {
            return str_replace("\x00", '', $bytes);
        }
        // A UTF-16LE line cut right after the 0x0A byte (as fgets() does) is missing the
        // high byte of its newline; pad it so the final code unit is complete.
        if ($encoding === 'UTF-16LE' && strlen($bytes) % 2 === 1) {
            $bytes .= "\x00";
        }
        $decoded = mb_convert_encoding($bytes, 'UTF-8', $encoding);

        return is_string($decoded) ? $decoded : str_replace("\x00", '', $bytes);
    };

    // Strip UTF-8 BOM
    if (str_starts_with($line, "\xEF\xBB\xBF")) {
        $line = substr($line, 3);
    }

    if (str_starts_with($line, "\xFF\xFE")) {
        // UTF-16LE BOM
        $line = $fromUtf16(substr($line, 2), 'UTF-16LE');
    } elseif (str_starts_with($line, "\xFE\xFF")) {
        // UTF-16BE BOM
        $line = $fromUtf16(substr($line, 2), 'UTF-16BE');
    } elseif (str_contains($line, "\x00")) {
        // Likely UTF-16 without BOM. Decoding never "fails" for the wrong byte order (it
        // just yields wrong characters), so pick the order up front: mostly-Latin text has
        // its NUL bytes at odd offsets in little-endian and at even offsets in big-endian.
        $evenNuls = 0;
        $oddNuls = 0;
        $length = strlen($line);
        for ($i = 0; $i < $length; $i++) {
            if ($line[$i] !== "\x00") {
                continue;
            }
            if ($i % 2 === 0) {
                $evenNuls++;
            } else {
                $oddNuls++;
            }
        }
        // Ties keep little-endian, the more common byte order for BOM-less exports.
        $line = $fromUtf16($line, $evenNuls > $oddNuls ? 'UTF-16BE' : 'UTF-16LE');
    } elseif ($canConvert && function_exists('mb_detect_encoding')) {
        // Non UTF-16: use mbstring's default detection order for portability across environments
        $enc = mb_detect_encoding($line, null, true);
        if (is_string($enc) && strtoupper($enc) !== 'UTF-8') {
            $converted = mb_convert_encoding($line, 'UTF-8', $enc);
            // Keep the original bytes if the conversion produced nothing usable.
            if (is_string($converted) && $converted !== '') {
                $line = $converted;
            }
        }
    }

    // Replace non-breaking space with regular space
    return str_replace("\xC2\xA0", ' ', $line);
}
/**
* Read the first meaningful (non-empty after normalization) line of a file; returns null on failure.
* Skips BOM-only lines and leading blank lines. Limits scanning to first 50 lines to be safe.
*/
public function readFirstMeaningfulLine(string $path): ?string
{
    // Best effort by contract: an unreadable path yields null, so the fopen() warning
    // is suppressed deliberately rather than surfaced to the caller.
    $fh = @fopen($path, 'r');
    if ($fh === false) {
        return null;
    }

    try {
        // The scan must start at the very first line of the file: nothing may be read
        // from the handle before this loop, or the header row is silently skipped.
        // At most 50 lines are inspected so a file of blank lines cannot stall the call.
        for ($remaining = 50; $remaining > 0; $remaining--) {
            $raw = fgets($fh);
            if ($raw === false) {
                // End of file or read error: no meaningful line was found.
                break;
            }

            $normalized = $this->normalizeLine($raw);
            if (trim($normalized) !== '') {
                // Returned untrimmed, exactly as normalizeLine() produced it.
                return $normalized;
            }
        }

        return null;
    } finally {
        // Release the handle on every exit path, including an exception from normalization.
        fclose($fh);
    }
}
/**
@@ -24,14 +88,15 @@ public function readFirstLine(string $path): ?string
public function detectColumnsFromCsv(string $path, bool $hasHeader): array
{
// Use actual tab character for TSV; keep other common delimiters
$delims = [',',';','|',"\t"];
$delims = [',', ';', '|', "\t"];
$bestDelim = ',';
$bestCols = [];
$firstLine = $this->readFirstLine($path);
$firstLine = $this->readFirstMeaningfulLine($path);
if ($firstLine === null) {
return [$bestDelim, []];
}
// Already normalized by readFirstMeaningfulLine
$maxCount = 0;
foreach ($delims as $d) {
@@ -44,12 +109,27 @@ public function detectColumnsFromCsv(string $path, bool $hasHeader): array
}
}
if (!$hasHeader) {
// Fallback: if str_getcsv failed to split but we clearly see delimiters, do a simple explode
if ($maxCount <= 1) {
foreach (["\t", ';', ',', '|'] as $d) {
if (substr_count($firstLine, $d) >= 1) {
$parts = explode($d, $firstLine);
if (count($parts) > $maxCount) {
$bestDelim = $d;
$bestCols = $parts;
$maxCount = count($parts);
}
}
}
}
if (! $hasHeader) {
// return positional indices 0..N-1
$cols = [];
for ($i = 0; $i < $maxCount; $i++) {
$cols[] = (string) $i;
}
return [$bestDelim, $cols];
}
@@ -57,6 +137,7 @@ public function detectColumnsFromCsv(string $path, bool $hasHeader): array
$clean = array_map(function ($v) {
$v = trim((string) $v);
$v = preg_replace('/\s+/', ' ', $v);
return $v;
}, $bestCols);
@@ -69,16 +150,23 @@ public function detectColumnsFromCsv(string $path, bool $hasHeader): array
*/
public function parseColumnsFromCsv(string $path, string $delimiter, bool $hasHeader): array
{
$firstLine = $this->readFirstLine($path);
$firstLine = $this->readFirstMeaningfulLine($path);
if ($firstLine === null) {
return [];
}
// Already normalized by readFirstMeaningfulLine
$row = str_getcsv($firstLine, $delimiter);
$count = is_array($row) ? count($row) : 0;
// Fallback explode if str_getcsv failed to split
if ($count <= 1 && substr_count($firstLine, $delimiter) >= 1) {
$row = explode($delimiter, $firstLine);
$count = count($row);
}
if ($hasHeader) {
return array_map(function ($v) {
$v = trim((string) $v);
$v = preg_replace('/\s+/', ' ', $v);
return $v;
}, $row ?: []);
}
@@ -86,6 +174,7 @@ public function parseColumnsFromCsv(string $path, string $delimiter, bool $hasHe
for ($i = 0; $i < $count; $i++) {
$cols[] = (string) $i;
}
return $cols;
}
}