fallbackNormalize($original); $fallbackLanguage = $this->detectLanguage($original); if (! (bool) config('services.llm.ranking_enabled', true)) { return [ 'normalized_message' => $fallback['text'], 'redaction_report' => [ 'mode' => 'fallback_regex', 'pii_types' => $fallback['pii_types'], 'language' => $fallbackLanguage, 'reason' => 'llm_normalization_disabled', ], ]; } $basePrompt = $this->settings->getPrompt('normalization', 'Rewrite and redact PII. Return JSON.'); $prompt = $basePrompt."\n\n". 'Also detect the original language. Return JSON with keys: normalized_message, redaction_report. '. "redaction_report must include pii_types and language as an ISO 639-1 code such as nl, en, de, fr.\n\n". "Original question:\n\"\"\"\n{$original}\n\"\"\""; try { $raw = $this->llmClient->generate($prompt, ['expect_json' => true, 'task' => 'normalization']); $decoded = $this->decodeJsonResponse($raw); if (is_array($decoded) && ! empty($decoded['normalized_message'])) { return [ 'normalized_message' => (string) $decoded['normalized_message'], 'redaction_report' => [ 'mode' => 'llm', 'pii_types' => $decoded['redaction_report']['pii_types'] ?? [], 'language' => $this->normalizeLanguageCode($decoded['redaction_report']['language'] ?? $fallbackLanguage), 'notes' => $decoded['redaction_report']['notes'] ?? 
null, 'raw' => $decoded, ], ]; } } catch (\Throwable $e) { return [ 'normalized_message' => $fallback['text'], 'redaction_report' => [ 'mode' => 'fallback_regex', 'pii_types' => $fallback['pii_types'], 'language' => $fallbackLanguage, 'reason' => 'llm_exception', 'error' => $e->getMessage(), ], ]; } return [ 'normalized_message' => $fallback['text'], 'redaction_report' => [ 'mode' => 'fallback_regex', 'pii_types' => $fallback['pii_types'], 'language' => $fallbackLanguage, 'reason' => 'llm_invalid_json_or_missing_fields', ], ]; } private function fallbackNormalize(string $text): array { $pii = []; // Replace highly-structured values first so looser patterns (phone) do not corrupt them. $orderedPatterns = [ 'email' => '/[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}/i', 'iban' => '/\b[A-Z]{2}[0-9]{2}[A-Z0-9]{10,30}\b/i', 'url' => '/https?:\/\/\S+/i', 'ip' => '/\b(?:\d{1,3}\.){3}\d{1,3}\b/', // Require separators to avoid matching long account-like sequences. 'phone' => '/(? $pattern) { if (preg_match($pattern, $text) === 1) { $pii[] = $type; $text = preg_replace($pattern, '['.strtoupper($type).']', $text) ?? $text; } } $text = preg_replace('/\s+/', ' ', trim($text)) ?? trim($text); return ['text' => $text, 'pii_types' => array_values(array_unique($pii))]; } private function detectLanguage(string $text): string { $lower = mb_strtolower($text); $dutchSignals = [' ik ', ' mijn ', ' een ', ' het ', ' de ', ' hoe ', ' niet ', ' wordt ', ' domeinnaam ', ' website ']; $englishSignals = [' i ', ' my ', ' the ', ' how ', ' not ', ' website ', ' domain ', ' redirected ']; $padded = ' '.$lower.' '; $nl = 0; $en = 0; foreach ($dutchSignals as $signal) { $nl += substr_count($padded, $signal); } foreach ($englishSignals as $signal) { $en += substr_count($padded, $signal); } return $en > $nl ? 
'en' : 'nl'; }

    /**
     * Map a free-form language label onto one of the supported two-letter codes.
     *
     * The label is trimmed and lower-cased, then matched in this order:
     *  - "nl": starts with "nl", or contains "dutch" / "nederlands"
     *  - "en": starts with "en", or contains "english"
     *  - "de": starts with "de", or contains "german" / "duits"
     *  - "fr": starts with "fr", or contains "french" / "frans"
     * Prefix matching means locale-style tags such as "en-US" or "de_DE" resolve too.
     * Anything else resolves to "nl".
     *
     * @param mixed $language Language label of any type (the caller passes a value taken
     *                        from decoded JSON, so it is not guaranteed to be a string).
     *
     * @return string One of "nl", "en", "de", "fr".
     */
    private function normalizeLanguageCode(mixed $language): string
    {
        // Arrays trigger an "Array to string conversion" warning when cast and objects
        // without __toString() throw an Error. Neither can name a language, so resolve
        // them (and null) to the default instead of attempting the cast.
        if (! is_scalar($language) && ! $language instanceof \Stringable) {
            return 'nl';
        }

        $value = mb_strtolower(trim((string) $language));

        return match (true) {
            str_starts_with($value, 'nl'),
            str_contains($value, 'dutch'),
            str_contains($value, 'nederlands') => 'nl',

            str_starts_with($value, 'en'),
            str_contains($value, 'english') => 'en',

            str_starts_with($value, 'de'),
            str_contains($value, 'german'),
            str_contains($value, 'duits') => 'de',

            str_starts_with($value, 'fr'),
            str_contains($value, 'french'),
            str_contains($value, 'frans') => 'fr',

            default => 'nl',
        };
    }

    /**
     * Best-effort extraction of a JSON document from a raw text response.
     *
     * Three attempts are made, in order, and the first one that decodes to an array wins:
     *  1. the whole trimmed string;
     *  2. the `{...}` content of a Markdown code fence (``` or ```json);
     *  3. the span from the first "{" to the last "}" in the string.
     *
     * json_decode() is called in associative mode, so JSON objects and JSON lists both
     * come back as PHP arrays; scalar JSON values are rejected.
     *
     * @param string $raw Raw response text.
     *
     * @return array|null The decoded array, or null when no attempt yields one.
     */
    private function decodeJsonResponse(string $raw): ?array
    {
        $raw = trim($raw);

        // Attempt 1: the response is already pure JSON.
        $decoded = json_decode($raw, true);
        if (is_array($decoded)) {
            return $decoded;
        }

        // Attempt 2: JSON wrapped in a Markdown code fence. The "s" flag lets "." span
        // newlines; the greedy ".*" captures up to the last "}" that precedes a closing fence.
        if (preg_match('/```(?:json)?\s*(\{.*\})\s*```/is', $raw, $matches) === 1) {
            $decoded = json_decode(trim($matches[1]), true);
            if (is_array($decoded)) {
                return $decoded;
            }
        }

        // Attempt 3: JSON surrounded by other text; take the outermost brace-delimited span.
        $start = strpos($raw, '{');
        $end = strrpos($raw, '}');
        if ($start !== false && $end !== false && $end > $start) {
            $candidate = substr($raw, $start, $end - $start + 1);
            $decoded = json_decode($candidate, true);
            if (is_array($decoded)) {
                return $decoded;
            }
        }

        return null;
    }
}