changes 0230092025
This commit is contained in:
@@ -5,15 +5,79 @@
|
||||
class CsvImportService
|
||||
{
|
||||
/**
 * Normalize one raw line to UTF-8 so it can be split on a delimiter.
 *
 * Steps, in order:
 *  - a leading UTF-8 BOM is removed;
 *  - a line starting with a UTF-16 BOM is decoded with the matching byte order;
 *  - a line without a BOM that contains NUL bytes is treated as UTF-16 and the
 *    byte order is guessed from where the NUL bytes sit;
 *  - any other line is run through mbstring detection (when available) and
 *    converted to UTF-8 if a non-UTF-8 encoding is reported;
 *  - UTF-8 non-breaking spaces are replaced with regular spaces.
 *
 * Line terminators and other control characters are left untouched.
 *
 * @param string $line Raw bytes of a single line.
 *
 * @return string The normalized line.
 */
private function normalizeLine(string $line): string
{
    // Strip UTF-8 BOM
    if (str_starts_with($line, "\xEF\xBB\xBF")) {
        $line = substr($line, 3);
    }

    if (str_starts_with($line, "\xFF\xFE")) {
        // UTF-16LE BOM
        $line = $this->decodeUtf16Line(substr($line, 2), 'UTF-16LE');
    } elseif (str_starts_with($line, "\xFE\xFF")) {
        // UTF-16BE BOM
        $line = $this->decodeUtf16Line(substr($line, 2), 'UTF-16BE');
    } elseif (str_contains($line, "\x00")) {
        // Likely UTF-16 without BOM. Malformed input is substituted by the
        // converter rather than reported as a failure, so "try LE, then BE on
        // failure" never reaches BE in practice. Instead, look at where the NUL
        // bytes are: for mostly-ASCII text they are the high bytes, which sit at
        // odd offsets in little-endian and at even offsets in big-endian data.
        $evenNuls = 0;
        $oddNuls = 0;
        $length = strlen($line);
        for ($i = 0; $i < $length; $i++) {
            if ($line[$i] !== "\x00") {
                continue;
            }
            if ($i % 2 === 0) {
                $evenNuls++;
            } else {
                $oddNuls++;
            }
        }

        // Ties keep the little-endian preference.
        $line = $this->decodeUtf16Line($line, $evenNuls > $oddNuls ? 'UTF-16BE' : 'UTF-16LE');
    } elseif (function_exists('mb_detect_encoding') && function_exists('mb_convert_encoding')) {
        // Non UTF-16: use the default detection order for portability across environments
        $enc = mb_detect_encoding($line, null, true);
        if (is_string($enc) && $enc !== '' && strtoupper($enc) !== 'UTF-8') {
            $converted = mb_convert_encoding($line, 'UTF-8', $enc);
            // Keep the original bytes when conversion yields nothing usable.
            if (is_string($converted) && $converted !== '') {
                $line = $converted;
            }
        }
    }

    // Replace non-breaking space with regular space
    return str_replace("\xC2\xA0", ' ', $line);
}

/**
 * Decode UTF-16 bytes (BOM already removed) to UTF-8.
 *
 * Without mbstring the NUL bytes are simply dropped, which recovers
 * ASCII-range text only.
 *
 * @param string $bytes    Raw UTF-16 bytes.
 * @param string $encoding Either 'UTF-16LE' or 'UTF-16BE'.
 *
 * @return string UTF-8 text, or the NUL-stripped bytes as a fallback.
 */
private function decodeUtf16Line(string $bytes, string $encoding): string
{
    if (!function_exists('mb_convert_encoding')) {
        return str_replace("\x00", '', $bytes);
    }

    // A byte-oriented line reader stops right after the 0x0A byte, which in
    // little-endian data is only the first half of the newline code unit.
    // Complete that unit so the decoder does not see a truncated trailing byte.
    if ($encoding === 'UTF-16LE' && strlen($bytes) % 2 === 1) {
        $bytes .= "\x00";
    }

    $decoded = mb_convert_encoding($bytes, 'UTF-8', $encoding);

    return is_string($decoded) ? $decoded : str_replace("\x00", '', $bytes);
}
|
||||
|
||||
/**
 * Read the first meaningful line of a file.
 *
 * Each line is passed through normalizeLine(); the first one that is not
 * empty after trimming is returned in its normalized form. BOM-only and
 * blank leading lines are therefore skipped. At most 50 lines are read.
 *
 * @param string $path Path of the file to inspect.
 *
 * @return string|null The normalized line, or null when the file cannot be
 *                     opened or no non-blank line is found within the limit.
 */
public function readFirstMeaningfulLine(string $path): ?string
{
    // fopen() emits a warning for a missing/unreadable path; callers only
    // expect a null return in that case.
    $fh = @fopen($path, 'r');
    if ($fh === false) {
        return null;
    }

    $line = null;

    try {
        // Scanning starts at the very first line of the file: nothing may be
        // read from the handle before this loop, or the header would be lost.
        for ($remaining = 50; $remaining > 0; $remaining--) {
            $raw = fgets($fh);
            if ($raw === false) {
                break;
            }

            $normalized = $this->normalizeLine($raw);
            if (trim($normalized) !== '') {
                $line = $normalized;
                break;
            }
        }
    } finally {
        fclose($fh);
    }

    return $line;
}
|
||||
|
||||
/**
|
||||
@@ -24,14 +88,15 @@ public function readFirstLine(string $path): ?string
|
||||
public function detectColumnsFromCsv(string $path, bool $hasHeader): array
|
||||
{
|
||||
// Use actual tab character for TSV; keep other common delimiters
|
||||
$delims = [',',';','|',"\t"];
|
||||
$delims = [',', ';', '|', "\t"];
|
||||
$bestDelim = ',';
|
||||
$bestCols = [];
|
||||
|
||||
$firstLine = $this->readFirstLine($path);
|
||||
$firstLine = $this->readFirstMeaningfulLine($path);
|
||||
if ($firstLine === null) {
|
||||
return [$bestDelim, []];
|
||||
}
|
||||
// Already normalized by readFirstMeaningfulLine
|
||||
|
||||
$maxCount = 0;
|
||||
foreach ($delims as $d) {
|
||||
@@ -44,12 +109,27 @@ public function detectColumnsFromCsv(string $path, bool $hasHeader): array
|
||||
}
|
||||
}
|
||||
|
||||
if (!$hasHeader) {
|
||||
// Fallback: if str_getcsv failed to split but we clearly see delimiters, do a simple explode
|
||||
if ($maxCount <= 1) {
|
||||
foreach (["\t", ';', ',', '|'] as $d) {
|
||||
if (substr_count($firstLine, $d) >= 1) {
|
||||
$parts = explode($d, $firstLine);
|
||||
if (count($parts) > $maxCount) {
|
||||
$bestDelim = $d;
|
||||
$bestCols = $parts;
|
||||
$maxCount = count($parts);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (! $hasHeader) {
|
||||
// return positional indices 0..N-1
|
||||
$cols = [];
|
||||
for ($i = 0; $i < $maxCount; $i++) {
|
||||
$cols[] = (string) $i;
|
||||
}
|
||||
|
||||
return [$bestDelim, $cols];
|
||||
}
|
||||
|
||||
@@ -57,6 +137,7 @@ public function detectColumnsFromCsv(string $path, bool $hasHeader): array
|
||||
$clean = array_map(function ($v) {
|
||||
$v = trim((string) $v);
|
||||
$v = preg_replace('/\s+/', ' ', $v);
|
||||
|
||||
return $v;
|
||||
}, $bestCols);
|
||||
|
||||
@@ -69,16 +150,23 @@ public function detectColumnsFromCsv(string $path, bool $hasHeader): array
|
||||
*/
|
||||
public function parseColumnsFromCsv(string $path, string $delimiter, bool $hasHeader): array
|
||||
{
|
||||
$firstLine = $this->readFirstLine($path);
|
||||
$firstLine = $this->readFirstMeaningfulLine($path);
|
||||
if ($firstLine === null) {
|
||||
return [];
|
||||
}
|
||||
// Already normalized by readFirstMeaningfulLine
|
||||
$row = str_getcsv($firstLine, $delimiter);
|
||||
$count = is_array($row) ? count($row) : 0;
|
||||
// Fallback explode if str_getcsv failed to split
|
||||
if ($count <= 1 && substr_count($firstLine, $delimiter) >= 1) {
|
||||
$row = explode($delimiter, $firstLine);
|
||||
$count = count($row);
|
||||
}
|
||||
if ($hasHeader) {
|
||||
return array_map(function ($v) {
|
||||
$v = trim((string) $v);
|
||||
$v = preg_replace('/\s+/', ' ', $v);
|
||||
|
||||
return $v;
|
||||
}, $row ?: []);
|
||||
}
|
||||
@@ -86,6 +174,7 @@ public function parseColumnsFromCsv(string $path, string $delimiter, bool $hasHe
|
||||
for ($i = 0; $i < $count; $i++) {
|
||||
$cols[] = (string) $i;
|
||||
}
|
||||
|
||||
return $cols;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user