Teren-app/app/Services/CsvImportService.php

<?php

namespace App\Services;

/**
 * Helpers for inspecting the first row of an uploaded CSV/TSV file:
 * encoding normalisation, delimiter detection and column extraction.
 *
 * All public methods work on the first "meaningful" line only (the first line
 * that is non-empty after normalisation), never on the whole file.
 */
class CsvImportService
{
    /** Byte-order marks recognised at the start of a line. */
    private const BOM_UTF8 = "\xEF\xBB\xBF";
    private const BOM_UTF16LE = "\xFF\xFE";
    private const BOM_UTF16BE = "\xFE\xFF";

    /** Maximum number of physical lines scanned while looking for the first meaningful one. */
    private const MAX_SCANNED_LINES = 50;

    /** Safety cap (bytes) when stitching a UTF-16 line together from several fgets() reads. */
    private const MAX_UTF16_LINE_BYTES = 1048576;

    /** Candidate delimiters for str_getcsv detection; on a tie the earlier entry wins. */
    private const DELIMITERS = [',', ';', '|', "\t"];

    /** Candidate delimiters for the plain explode() fallback; on a tie the earlier entry wins. */
    private const FALLBACK_DELIMITERS = ["\t", ';', ',', '|'];

    /**
     * Normalize a line to UTF-8 and strip BOMs so it can be split reliably.
     *
     * @param string      $line  Raw bytes of one line.
     * @param string|null $utf16 'UTF-16LE' / 'UTF-16BE' when the caller already knows the
     *                           file encoding; null to detect it from the line itself.
     *
     * @return string The line converted to UTF-8 where possible, with U+00A0 replaced by a space.
     */
    private function normalizeLine(string $line, ?string $utf16 = null): string
    {
        if (str_starts_with($line, self::BOM_UTF8)) {
            $line = substr($line, strlen(self::BOM_UTF8));
        }

        $utf16 ??= $this->detectUtf16($line);

        if ($utf16 !== null) {
            $bom = $utf16 === 'UTF-16LE' ? self::BOM_UTF16LE : self::BOM_UTF16BE;
            if (str_starts_with($line, $bom)) {
                $line = substr($line, strlen($bom));
            }
            $line = $this->decodeUtf16($line, $utf16);
        } elseif (function_exists('mb_detect_encoding') && function_exists('mb_convert_encoding')) {
            // Non UTF-16: rely on the environment's default detection order for portability.
            $detected = mb_detect_encoding($line, null, true);
            if ($detected && strtoupper($detected) !== 'UTF-8') {
                $converted = mb_convert_encoding($line, 'UTF-8', $detected);
                if (is_string($converted) && $converted !== '') {
                    $line = $converted;
                }
            }
        }

        // Replace non-breaking space with regular space
        return str_replace("\xC2\xA0", ' ', $line);
    }

    /**
     * Guess whether a byte string is UTF-16 and, if so, its byte order.
     *
     * A BOM decides immediately. Without a BOM, any NUL byte is taken as a sign of
     * UTF-16; the byte order is inferred from where the NULs sit: for mostly-ASCII
     * text they fall on odd offsets in little-endian and on even offsets in big-endian.
     * Little-endian is the default when the counts do not favour big-endian.
     *
     * @return string|null 'UTF-16LE', 'UTF-16BE', or null when the bytes do not look like UTF-16.
     */
    private function detectUtf16(string $bytes): ?string
    {
        if (str_starts_with($bytes, self::BOM_UTF16LE)) {
            return 'UTF-16LE';
        }
        if (str_starts_with($bytes, self::BOM_UTF16BE)) {
            return 'UTF-16BE';
        }
        if (! str_contains($bytes, "\x00")) {
            return null;
        }

        $evenNulls = 0;
        $oddNulls = 0;
        $length = strlen($bytes);
        for ($i = 0; $i < $length; $i++) {
            if ($bytes[$i] !== "\x00") {
                continue;
            }
            if ($i % 2 === 0) {
                $evenNulls++;
            } else {
                $oddNulls++;
            }
        }

        return $evenNulls > $oddNulls ? 'UTF-16BE' : 'UTF-16LE';
    }

    /**
     * Decode UTF-16 bytes (without BOM) to UTF-8.
     *
     * A dangling trailing byte (odd length) is dropped because it cannot form a code unit.
     * When mbstring is unavailable the NUL bytes are simply removed, which is only
     * correct for ASCII-range content.
     */
    private function decodeUtf16(string $bytes, string $encoding): string
    {
        if (strlen($bytes) % 2 === 1) {
            $bytes = substr($bytes, 0, -1);
        }

        if (function_exists('mb_convert_encoding')) {
            $decoded = mb_convert_encoding($bytes, 'UTF-8', $encoding);
            if (is_string($decoded)) {
                return $decoded;
            }
        }

        return str_replace("\x00", '', $bytes);
    }

    /**
     * Extend a chunk returned by fgets() until it ends on a real UTF-16 newline.
     *
     * fgets() stops at the byte 0x0A. In UTF-16LE the newline is 0x0A 0x00, so the
     * trailing 0x00 is left in the stream and would misalign every following line;
     * in either byte order 0x0A may also be one half of an unrelated code unit.
     * Reading continues until the buffer has an even length and ends with the
     * newline code unit, the stream is exhausted, or the size cap is reached.
     *
     * @param resource $handle   Open file handle positioned right after $raw.
     * @param string   $raw      Bytes already read for this line.
     * @param string   $encoding 'UTF-16LE' or 'UTF-16BE'.
     */
    private function completeUtf16Line($handle, string $raw, string $encoding): string
    {
        $isLittleEndian = $encoding === 'UTF-16LE';
        $newline = $isLittleEndian ? "\n\x00" : "\x00\n";

        while (strlen($raw) < self::MAX_UTF16_LINE_BYTES) {
            $aligned = strlen($raw) % 2 === 0;
            if ($aligned && str_ends_with($raw, $newline)) {
                break;
            }

            if ($isLittleEndian && ! $aligned && str_ends_with($raw, "\n")) {
                // Fetch the high byte that belongs to the 0x0A we stopped on.
                $more = fgetc($handle);
            } else {
                $more = fgets($handle);
            }

            if ($more === false) {
                break;
            }
            $raw .= $more;
        }

        return $raw;
    }

    /**
     * Read the first meaningful (non-empty after normalization) line of a file.
     *
     * Skips BOM-only and blank leading lines and scans at most the first 50 lines.
     * The returned line is normalized to UTF-8 but not trimmed.
     *
     * @return string|null The normalized line, or null when the file cannot be opened
     *                     or no meaningful line is found within the scan limit.
     */
    public function readFirstMeaningfulLine(string $path): ?string
    {
        // Warning suppressed on purpose: an unreadable file is reported as null.
        $handle = @fopen($path, 'rb');
        if ($handle === false) {
            return null;
        }

        try {
            // Once UTF-16 is recognised it stays in effect for the following lines.
            $utf16 = null;

            for ($remaining = self::MAX_SCANNED_LINES; $remaining > 0; $remaining--) {
                $raw = fgets($handle);
                if ($raw === false) {
                    break;
                }

                $utf16 ??= $this->detectUtf16($raw);
                if ($utf16 !== null) {
                    $raw = $this->completeUtf16Line($handle, $raw, $utf16);
                }

                $normalized = $this->normalizeLine($raw, $utf16);
                if (trim($normalized) !== '') {
                    return $normalized;
                }
            }

            return null;
        } finally {
            fclose($handle);
        }
    }

    /**
     * Detect delimiter and return columns for first row.
     *
     * Each candidate delimiter is tried with str_getcsv; the one yielding the most
     * fields wins. If none splits the line into more than one field, a plain
     * explode() is attempted as a fallback.
     *
     * @param bool $hasHeader When false, positional indices ("0", "1", ...) are returned
     *                        instead of header names.
     *
     * @return array{0: string, 1: list<string>} [delimiter, columns]; the delimiter defaults
     *                                           to ',' and columns to [] when no line is readable.
     */
    public function detectColumnsFromCsv(string $path, bool $hasHeader): array
    {
        $bestDelim = ',';
        $bestCols = [];
        $maxCount = 0;

        $firstLine = $this->readFirstMeaningfulLine($path);
        if ($firstLine === null) {
            return [$bestDelim, []];
        }

        foreach (self::DELIMITERS as $candidate) {
            // Enclosure and escape are passed explicitly (same as the historical defaults).
            $fields = str_getcsv($firstLine, $candidate, '"', '\\');
            if (count($fields) > $maxCount) {
                $maxCount = count($fields);
                $bestDelim = $candidate;
                $bestCols = $fields;
            }
        }

        // Fallback: str_getcsv did not split but a delimiter is clearly present.
        if ($maxCount <= 1) {
            foreach (self::FALLBACK_DELIMITERS as $candidate) {
                if (! str_contains($firstLine, $candidate)) {
                    continue;
                }
                $parts = explode($candidate, $firstLine);
                if (count($parts) > $maxCount) {
                    $maxCount = count($parts);
                    $bestDelim = $candidate;
                    $bestCols = $parts;
                }
            }
        }

        $columns = $hasHeader
            ? $this->cleanHeaders($bestCols)
            : $this->positionalColumns($maxCount);

        return [$bestDelim, $columns];
    }

    /**
     * Parse columns from CSV using a specific delimiter.
     *
     * A single-byte delimiter is parsed with str_getcsv (with an explode() fallback when
     * that yields one field although the delimiter occurs). A longer delimiter is split
     * literally; an empty delimiter yields the whole line as one column.
     *
     * @param bool $hasHeader When false, positional indices ("0", "1", ...) are returned
     *                        instead of header names.
     *
     * @return list<string> Column names or indices; [] when no line is readable.
     */
    public function parseColumnsFromCsv(string $path, string $delimiter, bool $hasHeader): array
    {
        $firstLine = $this->readFirstMeaningfulLine($path);
        if ($firstLine === null) {
            return [];
        }

        if ($delimiter === '') {
            $fields = [$firstLine];
        } elseif (strlen($delimiter) > 1) {
            // str_getcsv only accepts a single-byte separator.
            $fields = explode($delimiter, $firstLine);
        } else {
            $fields = str_getcsv($firstLine, $delimiter, '"', '\\');
            if (count($fields) <= 1 && str_contains($firstLine, $delimiter)) {
                $fields = explode($delimiter, $firstLine);
            }
        }

        return $hasHeader
            ? $this->cleanHeaders($fields)
            : $this->positionalColumns(count($fields));
    }

    /**
     * Trim each header and collapse internal whitespace runs to a single space.
     *
     * @param array<int, string|null> $fields
     *
     * @return array<int, string>
     */
    private function cleanHeaders(array $fields): array
    {
        return array_map(
            static function ($field): string {
                $trimmed = trim((string) $field);

                return preg_replace('/\s+/', ' ', $trimmed) ?? $trimmed;
            },
            $fields
        );
    }

    /**
     * Build positional column names "0" .. "N-1".
     *
     * @return list<string>
     */
    private function positionalColumns(int $count): array
    {
        if ($count <= 0) {
            return [];
        }

        return array_map(
            static fn (int $index): string => (string) $index,
            range(0, $count - 1)
        );
    }
}