changes 0230092025

2025-09-30 00:06:47 +02:00
parent 1fddf959f0
commit a2bb75fdcc
31 changed files with 2729 additions and 628 deletions
@@ -5,15 +5,79 @@
 class CsvImportService
 {
    /**
-     * Read the first line of a file; returns null on failure.
+     * Normalize a raw line to UTF-8: strip a leading BOM, transcode UTF-16 or other detected
+     * encodings, and replace non-breaking spaces with regular spaces for robust splitting.
     */
-    public function readFirstLine(string $path): ?string
+    private function normalizeLine(string $line): string
+    {
+        // Contract: $line holds the raw bytes of one line as read from the file; the return
+        // value is the same text as UTF-8 with a leading BOM removed and NBSP turned into ' '.
+        // Without mbstring, UTF-16 input is degraded to "drop the NUL bytes", which is only
+        // correct for ASCII-range content.
+
+        // Decode BOM-free UTF-16 bytes of a known byte order into UTF-8.
+        $decodeUtf16 = static function (string $bytes, bool $littleEndian): string {
+            if (! function_exists('mb_convert_encoding')) {
+                return str_replace("\x00", '', $bytes);
+            }
+            if (strlen($bytes) % 2 === 1) {
+                // A byte-oriented line read stops right after the 0x0A byte, so a UTF-16LE
+                // line loses the high byte of its newline: restore it. Any other dangling
+                // byte is an incomplete code unit and is dropped instead of being decoded.
+                $bytes = ($littleEndian && str_ends_with($bytes, "\n"))
+                    ? $bytes . "\x00"
+                    : substr($bytes, 0, -1);
+            }
+            $utf8 = mb_convert_encoding($bytes, 'UTF-8', $littleEndian ? 'UTF-16LE' : 'UTF-16BE');
+
+            return is_string($utf8) ? $utf8 : str_replace("\x00", '', $bytes);
+        };
+
+        // Strip UTF-8 BOM
+        if (str_starts_with($line, "\xEF\xBB\xBF")) {
+            $line = substr($line, 3);
+        }
+
+        if (str_starts_with($line, "\xFF\xFE")) {
+            // UTF-16LE BOM
+            $line = $decodeUtf16(substr($line, 2), true);
+        } elseif (str_starts_with($line, "\xFE\xFF")) {
+            // UTF-16BE BOM
+            $line = $decodeUtf16(substr($line, 2), false);
+        } elseif (str_contains($line, "\x00")) {
+            // Likely UTF-16 without BOM. Decoding as UTF-16LE does not fail on big-endian
+            // bytes (it just yields wrong characters), so the byte order is inferred from
+            // where the NUL bytes sit: for mostly-ASCII text they are the high bytes, i.e.
+            // odd offsets in little-endian and even offsets in big-endian data.
+            $evenNuls = 0;
+            $oddNuls = 0;
+            $length = strlen($line);
+            for ($i = 0; $i < $length; $i++) {
+                if ($line[$i] !== "\x00") {
+                    continue;
+                }
+                if ($i % 2 === 0) {
+                    $evenNuls++;
+                } else {
+                    $oddNuls++;
+                }
+            }
+            // Ties keep the previous preference for little-endian.
+            $line = $decodeUtf16($line, $oddNuls >= $evenNuls);
+        } elseif (function_exists('mb_detect_encoding') && function_exists('mb_convert_encoding')) {
+            // Non UTF-16: use the default detection order for portability across environments
+            // and convert to UTF-8 only when something other than UTF-8 was detected.
+            $enc = @mb_detect_encoding($line, null, true);
+            if ($enc && strtoupper($enc) !== 'UTF-8') {
+                $line = @mb_convert_encoding($line, 'UTF-8', $enc) ?: $line;
+            }
+        }
+
+        // Replace non-breaking space with regular space
+        return str_replace("\xC2\xA0", ' ', $line);
+    }
+
+    /**
+     * Read the first meaningful (non-empty after normalization) line of a file; returns null on failure.
+     * Skips BOM-only lines and leading blank lines. Limits scanning to first 50 lines to be safe.
+     *
+     * @param string $path Path of the file to open for reading.
+     * @return string|null The normalized (but untrimmed) line, or null when the file cannot be
+     *                     opened or none of the scanned lines has non-whitespace content.
+     */
+    public function readFirstMeaningfulLine(string $path): ?string
    {
        $fh = @fopen($path, 'r');
-        if (!$fh) return null;
-        $line = fgets($fh);
+        if (! $fh) {
+            return null;
+        }
+        $line = null;
+        $limit = 50; // maximum number of lines inspected before giving up
+        while ($limit-- > 0 && ($raw = fgets($fh)) !== false) {
+            $normalized = $this->normalizeLine($raw);
+            if (trim($normalized) !== '') { // trim() is used only for the emptiness test
+                $line = $normalized; // kept untrimmed, so a trailing newline stays in the result
+                break;
+            }
+        }
        fclose($fh);
-        return $line === false ? null : $line;
+
+        return $line;
    }

    /**
@@ -24,14 +88,15 @@ public function readFirstLine(string $path): ?string
    public function detectColumnsFromCsv(string $path, bool $hasHeader): array
    {
        // Use actual tab character for TSV; keep other common delimiters
-        $delims = [',',';','|',"\t"]; 
+        $delims = [',', ';', '|', "\t"];
        $bestDelim = ',';
        $bestCols = [];

-        $firstLine = $this->readFirstLine($path);
+        $firstLine = $this->readFirstMeaningfulLine($path);
        if ($firstLine === null) {
            return [$bestDelim, []];
        }
+        // Already normalized by readFirstMeaningfulLine

        $maxCount = 0;
        foreach ($delims as $d) {
@@ -44,12 +109,27 @@ public function detectColumnsFromCsv(string $path, bool $hasHeader): array
            }
        }

-        if (!$hasHeader) {
+        // Fallback: if str_getcsv failed to split but we clearly see delimiters, do a simple explode
+        if ($maxCount <= 1) {
+            foreach (["\t", ';', ',', '|'] as $d) {
+                if (substr_count($firstLine, $d) >= 1) {
+                    $parts = explode($d, $firstLine);
+                    if (count($parts) > $maxCount) {
+                        $bestDelim = $d;
+                        $bestCols = $parts;
+                        $maxCount = count($parts);
+                    }
+                }
+            }
+        }
+
+        if (! $hasHeader) {
            // return positional indices 0..N-1
            $cols = [];
            for ($i = 0; $i < $maxCount; $i++) {
                $cols[] = (string) $i;
            }
+
            return [$bestDelim, $cols];
        }

@@ -57,6 +137,7 @@ public function detectColumnsFromCsv(string $path, bool $hasHeader): array
        $clean = array_map(function ($v) {
            $v = trim((string) $v);
            $v = preg_replace('/\s+/', ' ', $v);
+
            return $v;
        }, $bestCols);

@@ -69,16 +150,23 @@ public function detectColumnsFromCsv(string $path, bool $hasHeader): array
     */
    public function parseColumnsFromCsv(string $path, string $delimiter, bool $hasHeader): array
    {
-        $firstLine = $this->readFirstLine($path);
+        $firstLine = $this->readFirstMeaningfulLine($path);
        if ($firstLine === null) {
            return [];
        }
+        // Already normalized by readFirstMeaningfulLine
        $row = str_getcsv($firstLine, $delimiter);
        $count = is_array($row) ? count($row) : 0;
+        // Fallback explode if str_getcsv failed to split
+        if ($count <= 1 && substr_count($firstLine, $delimiter) >= 1) {
+            $row = explode($delimiter, $firstLine);
+            $count = count($row);
+        }
        if ($hasHeader) {
            return array_map(function ($v) {
                $v = trim((string) $v);
                $v = preg_replace('/\s+/', ' ', $v);
+
                return $v;
            }, $row ?: []);
        }
@@ -86,6 +174,7 @@ public function parseColumnsFromCsv(string $path, string $delimiter, bool $hasHe
        for ($i = 0; $i < $count; $i++) {
            $cols[] = (string) $i;
        }
+
        return $cols;
    }
 }