Teren-app/app/Services/CsvImportService.php
2025-09-30 00:06:47 +02:00

181 lines
6.1 KiB
PHP

<?php
namespace App\Services;
/**
 * Inspects the first meaningful line of a CSV/TSV file: normalizes it to UTF-8,
 * then works out the delimiter and the column names (or positional indices).
 */
class CsvImportService
{
    /** Maximum number of leading lines scanned while looking for a non-blank one. */
    private const MAX_SCAN_LINES = 50;

    /** Candidate delimiters; on a tie the earlier entry wins. */
    private const DELIMITERS = [',', ';', '|', "\t"];

    /** Probe order for the naive explode() fallback. */
    private const FALLBACK_DELIMITERS = ["\t", ';', ',', '|'];

    /**
     * Normalize a raw line to UTF-8 for robust splitting.
     *
     * Strips a UTF-8 BOM, decodes UTF-16 (with or without BOM), converts other
     * encodings that mbstring can detect, and replaces non-breaking spaces with
     * regular spaces. Control characters other than NUL bytes are left untouched.
     *
     * @param string $line Raw bytes as returned by fgets().
     *
     * @return string The normalized line (line terminator preserved).
     */
    private function normalizeLine(string $line): string
    {
        // Strip UTF-8 BOM.
        if (str_starts_with($line, "\xEF\xBB\xBF")) {
            $line = substr($line, 3);
        }

        if (str_starts_with($line, "\xFF\xFE")) {
            // UTF-16LE BOM.
            $line = $this->decodeUtf16(substr($line, 2), 'UTF-16LE');
        } elseif (str_starts_with($line, "\xFE\xFF")) {
            // UTF-16BE BOM.
            $line = $this->decodeUtf16(substr($line, 2), 'UTF-16BE');
        } elseif (str_contains($line, "\x00")) {
            // NUL bytes without a BOM: most likely UTF-16, byte order unknown.
            $line = $this->decodeUtf16($line, $this->guessUtf16ByteOrder($line));
        } elseif (function_exists('mb_detect_encoding') && function_exists('mb_convert_encoding')) {
            // Not UTF-16: rely on mbstring's default detection order for portability.
            $enc = @mb_detect_encoding($line, null, true);
            if ($enc && strtoupper($enc) !== 'UTF-8') {
                $line = @mb_convert_encoding($line, 'UTF-8', $enc) ?: $line;
            }
        }

        // Replace non-breaking space with regular space.
        return str_replace("\xC2\xA0", ' ', $line);
    }

    /**
     * Decode a UTF-16 byte string to UTF-8.
     *
     * Without mbstring the NUL bytes are simply removed, which is only correct
     * for ASCII-range content.
     *
     * @param string $bytes    Raw bytes with any BOM already removed.
     * @param string $encoding Either 'UTF-16LE' or 'UTF-16BE'.
     */
    private function decodeUtf16(string $bytes, string $encoding): string
    {
        if (! function_exists('mb_convert_encoding')) {
            return str_replace("\x00", '', $bytes);
        }

        if (strlen($bytes) % 2 !== 0) {
            // fgets() cuts right after the 0x0A byte. In little-endian data that
            // separates the newline from its 0x00 high byte, so put it back.
            // Any other dangling byte is an incomplete code unit and is dropped.
            $bytes = ($encoding === 'UTF-16LE' && str_ends_with($bytes, "\n"))
                ? $bytes . "\x00"
                : substr($bytes, 0, -1);
        }

        $decoded = @mb_convert_encoding($bytes, 'UTF-8', $encoding);

        return is_string($decoded) ? $decoded : str_replace("\x00", '', $bytes);
    }

    /**
     * Guess the byte order of BOM-less UTF-16 from where the NUL bytes sit.
     *
     * For mostly-ASCII text the NUL high bytes land on odd offsets in UTF-16LE
     * and on even offsets in UTF-16BE. A little-endian line that fgets() left
     * shifted by one byte (it starts with the previous newline's high byte) is
     * byte-for-byte big-endian, so it is reported as 'UTF-16BE' as well.
     *
     * @return string 'UTF-16BE' or 'UTF-16LE' (the default on a tie).
     */
    private function guessUtf16ByteOrder(string $bytes): string
    {
        $evenNulls = 0;
        $oddNulls = 0;
        $length = strlen($bytes);
        for ($i = 0; $i < $length; $i++) {
            if ($bytes[$i] !== "\x00") {
                continue;
            }
            if ($i % 2 === 0) {
                $evenNulls++;
            } else {
                $oddNulls++;
            }
        }

        return $evenNulls > $oddNulls ? 'UTF-16BE' : 'UTF-16LE';
    }

    /**
     * Trim each header name and collapse internal whitespace runs to one space.
     *
     * @param array<int, mixed> $names Raw fields of the header row.
     *
     * @return array<int, string>
     */
    private function cleanHeaderNames(array $names): array
    {
        return array_map(static function ($name): string {
            $name = trim((string) $name);

            return preg_replace('/\s+/', ' ', $name) ?? $name;
        }, $names);
    }

    /**
     * Build the positional column names "0" .. "N-1".
     *
     * @return array<int, string>
     */
    private function positionalColumns(int $count): array
    {
        $cols = [];
        for ($i = 0; $i < $count; $i++) {
            $cols[] = (string) $i;
        }

        return $cols;
    }

    /**
     * Split one normalized line on the given delimiter.
     *
     * A single-byte delimiter goes through str_getcsv(), falling back to a
     * plain explode() when that yields one field although the delimiter is
     * present. A multi-character delimiter, which str_getcsv() rejects, is
     * split with explode(). An empty delimiter returns the line as one field.
     *
     * @return array<int, string|null>
     */
    private function splitLine(string $line, string $delimiter): array
    {
        if ($delimiter === '') {
            return [$line];
        }
        if (strlen($delimiter) > 1) {
            return explode($delimiter, $line);
        }

        // Explicit enclosure/escape keep the historical defaults and avoid the
        // PHP 8.4 deprecation for relying on the default escape character.
        $fields = str_getcsv($line, $delimiter, '"', '\\');
        if (count($fields) <= 1 && str_contains($line, $delimiter)) {
            return explode($delimiter, $line);
        }

        return $fields;
    }

    /**
     * Read the first meaningful (non-blank after normalization) line of a file.
     *
     * Skips leading blank and BOM-only lines and scans at most the first
     * 50 lines.
     *
     * @param string $path File to read.
     *
     * @return string|null The normalized line, or null when the file cannot be
     *                     opened or no meaningful line is found within the limit.
     */
    public function readFirstMeaningfulLine(string $path): ?string
    {
        // Suppress the warning: an unreadable path is reported as null instead.
        $handle = @fopen($path, 'r');
        if ($handle === false) {
            return null;
        }

        try {
            for ($scanned = 0; $scanned < self::MAX_SCAN_LINES; $scanned++) {
                $raw = fgets($handle);
                if ($raw === false) {
                    break;
                }
                $normalized = $this->normalizeLine($raw);
                if (trim($normalized) !== '') {
                    return $normalized;
                }
            }

            return null;
        } finally {
            fclose($handle);
        }
    }

    /**
     * Detect the delimiter and return the columns of the first row.
     *
     * The delimiter that splits the first meaningful line into the most fields
     * wins. If $hasHeader is false, positional indices ("0", "1", ...) are
     * returned instead of header names.
     *
     * @param string $path      File to inspect.
     * @param bool   $hasHeader Whether the first row holds column names.
     *
     * @return array{0: string, 1: array<int, string>} [delimiter, columns];
     *         [',', []] when no line could be read.
     */
    public function detectColumnsFromCsv(string $path, bool $hasHeader): array
    {
        $bestDelim = ',';
        $bestCols = [];

        // Already normalized by readFirstMeaningfulLine.
        $firstLine = $this->readFirstMeaningfulLine($path);
        if ($firstLine === null) {
            return [$bestDelim, []];
        }

        $maxCount = 0;
        foreach (self::DELIMITERS as $candidate) {
            $row = str_getcsv($firstLine, $candidate, '"', '\\');
            if (count($row) > $maxCount) {
                $maxCount = count($row);
                $bestDelim = $candidate;
                $bestCols = $row;
            }
        }

        // Fallback: str_getcsv found a single field (e.g. the whole line is
        // quoted) although a delimiter is visibly present, so explode instead.
        if ($maxCount <= 1) {
            foreach (self::FALLBACK_DELIMITERS as $candidate) {
                if (! str_contains($firstLine, $candidate)) {
                    continue;
                }
                $parts = explode($candidate, $firstLine);
                if (count($parts) > $maxCount) {
                    $maxCount = count($parts);
                    $bestDelim = $candidate;
                    $bestCols = $parts;
                }
            }
        }

        $columns = $hasHeader
            ? $this->cleanHeaderNames($bestCols)
            : $this->positionalColumns($maxCount);

        return [$bestDelim, $columns];
    }

    /**
     * Parse the columns of the first row using a specific delimiter.
     *
     * If $hasHeader is false, positional indices ("0", "1", ...) are returned
     * instead of header names. An empty delimiter yields a single column and a
     * multi-character delimiter is split literally.
     *
     * @param string $path      File to inspect.
     * @param string $delimiter Field separator chosen by the caller.
     * @param bool   $hasHeader Whether the first row holds column names.
     *
     * @return array<int, string> Column names or indices; [] when no line could be read.
     */
    public function parseColumnsFromCsv(string $path, string $delimiter, bool $hasHeader): array
    {
        // Already normalized by readFirstMeaningfulLine.
        $firstLine = $this->readFirstMeaningfulLine($path);
        if ($firstLine === null) {
            return [];
        }

        $row = $this->splitLine($firstLine, $delimiter);

        return $hasHeader
            ? $this->cleanHeaderNames($row)
            : $this->positionalColumns(count($row));
    }
}