111 lines
3.8 KiB
PHP
111 lines
3.8 KiB
PHP
<?php
|
|
|
|
namespace App\Console\Commands;
|
|
|
|
use App\Models\Contract;
|
|
use App\Models\DocumentTemplate;
|
|
use App\Services\Documents\DocumentSettings;
|
|
use App\Services\Documents\TokenScanner;
|
|
use App\Services\Documents\TokenValueResolver;
|
|
use Illuminate\Console\Command;
|
|
|
|
/**
 * Artisan diagnostic command `doc:scan`.
 *
 * Reads a Word `document.xml` file, prints the tokens the TokenScanner finds
 * in it, and prints the values the TokenValueResolver returns for those tokens
 * against the contract identified by the given UUID.
 */
class DocScanCommand extends Command
{
    /** Maximum number of characters of a resolved value printed before truncation. */
    private const PREVIEW_LIMIT = 120;

    /**
     * Markup between the text of two adjacent runs:
     *   </w:t></w:r> [proofErr...] [<w:r ...> [<w:rPr>...</w:rPr>...]] <w:t ...>
     *
     * - Tag names are anchored with `(?:\s[^>]*)?>` so `<w:t` does not also match
     *   `<w:tab/>`, `<w:tbl>`, `<w:tc>`, `<w:tr>`, and `<w:r` does not also match
     *   `<w:rPr>` or `<w:rFonts .../>`.
     * - The run-properties span is tempered with `(?!</w:r>)` so it can never
     *   extend past the end of its own run and swallow text of later runs.
     */
    private const RUN_BOUNDARY_PATTERN = '#</w:t>\s*</w:r>\s*(?:<w:proofErr[^>]*/>\s*)*(?:<w:r(?:\s[^>]*)?>\s*(?:<w:rPr>(?:(?!</w:r>).)*?</w:rPr>\s*)*)?<w:t(?:\s[^>]*)?>#is';

    protected $signature = 'doc:scan {contract : Contract UUID} {xml : Path to Word document.xml}';

    protected $description = 'Scan a Word document.xml for tokens and resolve values against a contract UUID';

    /**
     * Execute the command.
     *
     * Prints the detected tokens, the resolved values (truncated for display)
     * and any tokens the resolver reported as unresolved.
     *
     * @return int self::SUCCESS, or self::FAILURE when the XML file is missing
     *             or unreadable, or no contract matches the UUID.
     */
    public function handle(TokenScanner $scanner, TokenValueResolver $resolver, DocumentSettings $settings): int
    {
        $uuid = (string) $this->argument('contract');
        $xmlPath = (string) $this->argument('xml');

        if (! is_file($xmlPath)) {
            $this->error("XML file not found: {$xmlPath}");

            return self::FAILURE;
        }

        // file_get_contents() raises an E_WARNING when it cannot open the file,
        // so check readability first and route that case to the error below.
        $xml = is_readable($xmlPath) ? file_get_contents($xmlPath) : false;
        if ($xml === false) {
            $this->error('Unable to read XML file.');

            return self::FAILURE;
        }

        $contract = Contract::where('uuid', $uuid)->first();
        if (! $contract) {
            $this->error("Contract not found for UUID: {$uuid}");

            return self::FAILURE;
        }

        // Normalize common Word run boundaries so tokens appear contiguous
        $tokens = $scanner->scan($this->normalizeRunsForTokens($xml));

        $this->info('Detected tokens:');
        foreach ($tokens as $token) {
            $this->line(" - {$token}");
        }
        if (empty($tokens)) {
            $this->warn('No tokens detected.');
        }

        // Build a minimal in-memory template using global whitelist so we can resolve values
        $whitelist = $settings->get()->whitelist ?? [];
        if (! is_array($whitelist)) {
            $whitelist = [];
        }
        $template = new DocumentTemplate([
            'entities' => array_keys($whitelist),
            'columns' => $whitelist,
            'fail_on_unresolved' => false,
            'formatting_options' => [],
            'meta' => [],
        ]);

        // Resolve values using a relaxed policy to avoid exceptions on unknowns.
        // No one is normally authenticated on the console, so fall back to the
        // first stored user, or an unsaved placeholder user when there is none.
        $user = auth()->user() ?? (\App\Models\User::query()->first() ?: new \App\Models\User(['name' => 'System']));
        $resolved = $resolver->resolve($tokens, $template, $contract, $user, policy: 'blank');
        $values = $resolved['values'] ?? [];
        $unresolved = $resolved['unresolved'] ?? [];

        $this->info('Resolved values:');
        foreach ($values as $key => $value) {
            $this->line(" - {$key} => {$this->previewValue($value)}");
        }

        if (! empty($unresolved)) {
            $this->warn('Unresolved tokens:');
            foreach ($unresolved as $token) {
                $this->line(" - {$token}");
            }
        }

        return self::SUCCESS;
    }

    /**
     * Render a resolved value as a single display string of at most
     * PREVIEW_LIMIT characters, ending in "..." when it was shortened.
     *
     * Null, scalars and Stringable objects are cast to string; anything else
     * (e.g. arrays) is shown as JSON, because casting it would raise an
     * "Array to string conversion" warning or a conversion Error.
     */
    private function previewValue(mixed $value): string
    {
        if ($value === null || is_scalar($value) || $value instanceof \Stringable) {
            $text = (string) $value;
        } else {
            $json = json_encode(
                $value,
                JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_PARTIAL_OUTPUT_ON_ERROR
            );
            $text = $json === false ? '['.get_debug_type($value).']' : $json;
        }

        // Measure and cut by characters, not bytes, so a multi-byte UTF-8
        // sequence is never split in the middle.
        if (mb_strlen($text, 'UTF-8') <= self::PREVIEW_LIMIT) {
            return $text;
        }

        return mb_substr($text, 0, self::PREVIEW_LIMIT - 3, 'UTF-8').'...';
    }

    /**
     * Return a copy of the document XML in which the text of adjacent runs is
     * joined, so a token that Word split over several runs becomes contiguous.
     *
     * The result is only meant to be scanned for tokens; it is not guaranteed
     * to remain well-formed XML.
     */
    private function normalizeRunsForTokens(string $xml): string
    {
        // Remove proofing error spans that may split content.
        // preg_replace() returns null on a PCRE error; keep the input in that case.
        $xml = preg_replace('#<w:proofErr[^>]*/>#i', '', $xml) ?? $xml;

        // Collapse boundaries between text runs until nothing changes any more
        do {
            $before = $xml;
            $xml = preg_replace(self::RUN_BOUNDARY_PATTERN, '', $xml) ?? $xml;
        } while ($xml !== $before);

        // Remove zero-width space (U+200B) and soft hyphen (U+00AD) characters
        return str_replace(["\xE2\x80\x8B", "\xC2\xAD"], '', $xml);
    }
}
|