Document gen fixed

This commit is contained in:
2025-10-12 17:52:17 +02:00
parent e0303ece74
commit 23f2011e33
16 changed files with 1116 additions and 88 deletions
+110
View File
@@ -0,0 +1,110 @@
<?php
namespace App\Console\Commands;
use App\Models\Contract;
use App\Models\DocumentTemplate;
use App\Services\Documents\DocumentSettings;
use App\Services\Documents\TokenScanner;
use App\Services\Documents\TokenValueResolver;
use Illuminate\Console\Command;
/**
 * Diagnostic Artisan command: scans a Word `document.xml` file for tokens and
 * prints the values they resolve to for a given contract.
 *
 * Usage: `php artisan doc:scan {contract-uuid} {path/to/document.xml}`
 *
 * The command only reads the XML file and prints to the console; it does not
 * write any file itself.
 */
class DocScanCommand extends Command
{
    /** Maximum number of characters of a resolved value printed per line. */
    private const MAX_VALUE_LENGTH = 120;

    protected $signature = 'doc:scan {contract : Contract UUID} {xml : Path to Word document.xml}';

    protected $description = 'Scan a Word document.xml for tokens and resolve values against a contract UUID';

    /**
     * Run the scan.
     *
     * @param  TokenScanner  $scanner  Extracts tokens from the normalized XML.
     * @param  TokenValueResolver  $resolver  Resolves tokens to values for the contract.
     * @param  DocumentSettings  $settings  Source of the global column whitelist.
     * @return int self::SUCCESS, or self::FAILURE when the XML file or the contract cannot be loaded.
     */
    public function handle(TokenScanner $scanner, TokenValueResolver $resolver, DocumentSettings $settings): int
    {
        $uuid = (string) $this->argument('contract');
        $xmlPath = (string) $this->argument('xml');

        if (! is_file($xmlPath)) {
            $this->error("XML file not found: {$xmlPath}");

            return self::FAILURE;
        }

        $xml = file_get_contents($xmlPath);
        if ($xml === false) {
            $this->error('Unable to read XML file.');

            return self::FAILURE;
        }

        $contract = Contract::where('uuid', $uuid)->first();
        if (! $contract) {
            $this->error("Contract not found for UUID: {$uuid}");

            return self::FAILURE;
        }

        // Normalize common Word run boundaries so tokens appear contiguous
        $norm = $this->normalizeRunsForTokens($xml);
        $tokens = $scanner->scan($norm);

        $this->info('Detected tokens:');
        foreach ($tokens as $t) {
            $this->line(" - {$t}");
        }
        if (empty($tokens)) {
            $this->warn('No tokens detected.');
        }

        // Build a minimal in-memory template using the global whitelist so we can resolve values.
        // The template is never saved by this command.
        $whitelist = $settings->get()->whitelist ?? [];
        if (! is_array($whitelist)) {
            $whitelist = [];
        }
        $template = new DocumentTemplate([
            'entities' => array_keys($whitelist),
            'columns' => $whitelist,
            'fail_on_unresolved' => false,
            'formatting_options' => [],
            'meta' => [],
        ]);

        // Acting user: the authenticated user if any, otherwise the first user
        // in the database, otherwise an unsaved placeholder named "System".
        $user = auth()->user() ?? (\App\Models\User::query()->first() ?: new \App\Models\User(['name' => 'System']));

        // Resolve values using a relaxed policy, intended to avoid exceptions on unknown tokens.
        $resolved = $resolver->resolve($tokens, $template, $contract, $user, policy: 'blank');
        $values = $resolved['values'] ?? [];
        $unresolved = $resolved['unresolved'] ?? [];

        $this->info('Resolved values:');
        foreach ($values as $k => $v) {
            $this->line(" - {$k} => ".$this->previewValue($v));
        }

        if (! empty($unresolved)) {
            $this->warn('Unresolved tokens:');
            foreach ($unresolved as $u) {
                $this->line(" - {$u}");
            }
        }

        return self::SUCCESS;
    }

    /**
     * Render a resolved value as a single console-friendly string, truncated to
     * MAX_VALUE_LENGTH characters (the last three being "...").
     *
     * Truncation is character-based (UTF-8) so a multi-byte character is never
     * cut in half. Values that cannot be cast to string directly (arrays,
     * non-stringable objects) are shown as JSON instead of triggering a
     * conversion warning or error.
     *
     * @param  mixed  $value  Raw value as returned by the resolver.
     */
    private function previewValue(mixed $value): string
    {
        if ($value === null || is_scalar($value) || $value instanceof \Stringable) {
            $text = (string) $value;
        } else {
            $encoded = json_encode($value, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
            // json_encode() returns false for unsupported values; fall back to the type name.
            $text = $encoded === false ? get_debug_type($value) : $encoded;
        }

        if (mb_strlen($text, 'UTF-8') <= self::MAX_VALUE_LENGTH) {
            return $text;
        }

        return mb_substr($text, 0, self::MAX_VALUE_LENGTH - 3, 'UTF-8').'...';
    }

    /**
     * Strip the WordprocessingML markup that sits between adjacent text runs so
     * that text split across several runs becomes one contiguous string.
     *
     * Steps:
     *  1. remove self-closing `<w:proofErr .../>` elements;
     *  2. repeatedly delete `</w:t></w:r> ... <w:r><w:rPr>...</w:rPr><w:t>` boundaries
     *     until the string stops changing;
     *  3. remove zero-width space (U+200B) and soft hyphen (U+00AD) characters.
     *
     * The result is meant for token scanning only; tags are deleted without
     * regard for keeping the document well-formed.
     *
     * NOTE(review): `<w:t[^>]*>` also matches other elements whose name starts
     * with `w:t` (e.g. `<w:tab/>`, `<w:tbl>`), and `.*?` with the `s` flag may
     * extend to a later `</w:rPr>` - confirm this is acceptable for scanning.
     * NOTE(review): TemplateScanCommand carries a broader variant of this
     * normalization (tabs/line breaks, in-brace cleanup); consider sharing one.
     *
     * @param  string  $xml  Raw contents of a Word XML part.
     * @return string Normalized text.
     */
    private function normalizeRunsForTokens(string $xml): string
    {
        // Remove proofing error spans that may split content.
        // preg_replace() returns null on failure; keep the previous string in that case.
        $xml = preg_replace('#<w:proofErr[^>]*/>#i', '', $xml) ?? $xml;

        // Iteratively collapse boundaries between text runs, even if w:rPr is present
        $patterns = [
            // </w:t></w:r> [optional proofErr] <w:r ...> [optional rPr] <w:t>
            '#</w:t>\s*</w:r>\s*(?:<w:proofErr[^>]*/>\s*)*(?:<w:r[^>]*>\s*(?:<w:rPr>.*?</w:rPr>\s*)*)?<w:t[^>]*>#is',
        ];

        do {
            $before = $xml;
            foreach ($patterns as $pat) {
                $xml = preg_replace($pat, '', $xml) ?? $xml;
            }
        } while ($before !== $xml);

        // Remove zero-width and soft hyphen characters
        return str_replace(["\xE2\x80\x8B", "\xC2\xAD"], '', $xml);
    }
}
@@ -0,0 +1,152 @@
<?php
namespace App\Console\Commands;
use App\Models\DocumentTemplate;
use App\Services\Documents\TokenScanner;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\Storage;
use ZipArchive;
/**
 * Diagnostic Artisan command: loads a stored DOCX template by slug (and
 * optionally version), scans its XML parts for tokens and prints them.
 *
 * Usage: `php artisan template:scan {slug} [--tpl-version=N] [--parts]`
 *
 * The DOCX is read from the `public` storage disk and copied to a temporary
 * file so it can be opened with ZipArchive; the temporary file is removed
 * before the command returns, on both success and failure paths.
 */
class TemplateScanCommand extends Command
{
    protected $signature = 'template:scan {slug : Template slug} {--tpl-version= : Specific template version number} {--parts : Show per-part tokens}';

    protected $description = 'Scan a stored DOCX template by slug/version and dump detected tokens directly from storage.';

    /**
     * Run the scan.
     *
     * @param  TokenScanner  $scanner  Extracts tokens from each normalized XML part.
     * @return int self::SUCCESS, or self::FAILURE when the template record or its file cannot be loaded.
     */
    public function handle(TokenScanner $scanner): int
    {
        $slug = (string) $this->argument('slug');
        $version = $this->option('tpl-version');
        // Explicit check instead of empty(): "0" is a provided value, not "no version".
        $hasVersion = $version !== null && $version !== '';

        $query = DocumentTemplate::query()->where('slug', $slug);
        if ($hasVersion) {
            $query->where('version', (int) $version);
        } else {
            // No version requested: take the highest one.
            $query->orderByDesc('version');
        }

        /** @var DocumentTemplate|null $template */
        $template = $query->first();
        if (! $template) {
            $this->error("Template not found for slug '{$slug}'".($hasVersion ? " v{$version}" : ''));

            return self::FAILURE;
        }

        $disk = 'public';
        $path = $template->file_path;
        if (! $path || ! Storage::disk($disk)->exists($path)) {
            $this->error('Template file not found on disk: '.$path);

            return self::FAILURE;
        }

        $bytes = Storage::disk($disk)->get($path);
        if (! is_string($bytes)) {
            $this->error('Unable to read template file: '.$path);

            return self::FAILURE;
        }

        // ZipArchive needs a real file path, so stage the bytes in a temp file.
        $tmp = tempnam(sys_get_temp_dir(), 'tmpl');
        if ($tmp === false) {
            $this->error('Unable to create temporary file.');

            return self::FAILURE;
        }

        $zip = new ZipArchive;
        $opened = false;
        try {
            if (file_put_contents($tmp, $bytes) === false) {
                $this->error('Unable to write temporary file.');

                return self::FAILURE;
            }
            if ($zip->open($tmp) !== true) {
                $this->error('Unable to open DOCX (zip).');

                return self::FAILURE;
            }
            $opened = true;

            $parts = $this->collectParts($zip);
        } finally {
            // Always release the archive handle and the temp file, including on
            // the early-return and exception paths above.
            if ($opened) {
                $zip->close();
            }
            // Best-effort cleanup: a failure to delete must not mask the real result.
            @unlink($tmp);
        }

        // Normalize and scan
        $all = [];
        $perPart = [];
        foreach ($parts as $name => $xml) {
            $found = $scanner->scan($this->normalizeRunsForTokens($xml));
            $perPart[$name] = $found;
            if ($found) {
                $all = array_merge($all, $found);
            }
        }
        $union = array_values(array_unique($all));

        $this->info("Template: {$template->name} (slug={$template->slug}, v{$template->version})");
        $this->line('File: '.$path);
        $this->line('Tokens found (union): '.count($union));
        foreach ($union as $t) {
            $this->line(' - '.$t);
        }

        if ($this->option('parts')) {
            $this->line('');
            $this->info('Per-part details:');
            foreach ($perPart as $n => $list) {
                $this->line("[{$n}] (".count($list).')');
                foreach ($list as $t) {
                    $this->line(' - '.$t);
                }
            }
        }

        return self::SUCCESS;
    }

    /**
     * Read the XML parts that are scanned for tokens from an opened DOCX archive:
     * the main document plus headers, footers, footnotes, endnotes and comments.
     *
     * Entries that cannot be read are skipped silently.
     *
     * @param  ZipArchive  $zip  An archive that has already been opened.
     * @return array<string, string> Map of entry name => raw XML, main document first when present.
     */
    private function collectParts(ZipArchive $zip): array
    {
        $parts = [];

        $doc = $zip->getFromName('word/document.xml');
        if ($doc !== false) {
            $parts['word/document.xml'] = $doc;
        }

        for ($i = 0; $i < $zip->numFiles; $i++) {
            $name = $zip->getNameIndex($i);
            if (! is_string($name)) {
                continue;
            }
            if (! preg_match('#^word/(header\d*|footer\d*|footnotes|endnotes|comments)\.xml$#i', $name)) {
                continue;
            }
            $xml = $zip->getFromName($name);
            if ($xml !== false) {
                $parts[$name] = $xml;
            }
        }

        return $parts;
    }

    /**
     * Strip the WordprocessingML markup that can split a token across runs, so
     * the scanner sees each token as one contiguous string.
     *
     * Steps:
     *  1. remove self-closing `<w:proofErr .../>` elements;
     *  2. repeatedly delete run/text boundaries (optionally containing
     *     proofErr, tab or line-break elements) until the string stops changing;
     *  3. inside every `{{ ... }}` span, drop all tags and all whitespace;
     *  4. inside every `{ ... }` span without nested braces, do the same, but only
     *     when the cleaned content looks like a dotted identifier path;
     *     otherwise the span is left untouched;
     *  5. remove zero-width space (U+200B) and soft hyphen (U+00AD) characters.
     *
     * The result is meant for token scanning only; tags are deleted without
     * regard for keeping the document well-formed.
     *
     * NOTE(review): `<w:t[^>]*>` also matches other elements whose name starts
     * with `w:t` (e.g. `<w:tab/>`, `<w:tbl>`), and `.*?` with the `s` flag may
     * extend to a later `</w:rPr>` - confirm this is acceptable for scanning.
     *
     * @param  string  $xml  Raw contents of a Word XML part.
     * @return string Normalized text.
     */
    private function normalizeRunsForTokens(string $xml): string
    {
        // Remove proofing error markers.
        // preg_replace() returns null on failure; keep the previous string in that case.
        $xml = preg_replace('#<w:proofErr[^>]*/>#i', '', $xml) ?? $xml;

        // Collapse boundaries between runs and inside runs (include tabs/line breaks)
        $patterns = [
            '#</w:t>\s*</w:r>\s*(?:<(?:w:proofErr|w:tab|w:br)[^>]*/>\s*)*(?:<w:r[^>]*>\s*(?:<w:rPr>.*?</w:rPr>\s*)*)?<w:t[^>]*>#is',
            '#</w:t>\s*(?:<(?:w:proofErr|w:tab|w:br)[^>]*/>\s*)*<w:t[^>]*>#is',
        ];

        do {
            $before = $xml;
            foreach ($patterns as $pat) {
                $xml = preg_replace($pat, '', $xml) ?? $xml;
            }
        } while ($before !== $xml);

        // Clean inside {{ ... }}
        $xml = preg_replace_callback('/\{\{.*?\}\}/s', static function (array $m): string {
            $inner = substr($m[0], 2, -2);
            $inner = preg_replace('/<[^>]+>/', '', $inner) ?? $inner;
            $inner = preg_replace('/\s+/', '', $inner) ?? $inner;

            return '{{'.$inner.'}}';
        }, $xml) ?? $xml;

        // Clean inside { ... } if it looks like a token
        $xml = preg_replace_callback('/\{[^{}]*\}/s', static function (array $m): string {
            $raw = $m[0];
            $inner = substr($raw, 1, -1);
            $clean = preg_replace('/<[^>]+>/', '', $inner) ?? $inner;
            $clean = preg_replace('/\s+/', '', $clean) ?? $clean;

            // Only rewrite spans whose cleaned content is a dotted path such as
            // "entity.column"; anything else is returned exactly as found.
            if (preg_match('/^[a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*\.[a-zA-Z0-9_.-]+$/', $clean)) {
                return '{'.$clean.'}';
            }

            return $raw;
        }, $xml) ?? $xml;

        // Remove zero-width and soft hyphen
        return str_replace(["\xE2\x80\x8B", "\xC2\xAD"], '', $xml);
    }
}