Files
TicketAssistent/app/Services/HelpdeskImportService.php

171 lines
5.0 KiB
PHP

<?php
namespace App\Services;
use App\Models\Article;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;
/**
 * Scrapes a public helpdesk website and mirrors its articles into the Article model.
 *
 * The crawl has three stages: the root page yields a category tree (embedded in
 * the HTML as a JavaScript `const categories = [...];` literal), the root page and
 * every category / sub-category page are scanned for article links, and each
 * article page is reduced to a plain-text title and body.
 */
class HelpdeskImportService
{
    private const DEFAULT_BASE_URL = 'https://www.internettoday.nl/helpdesk';

    /**
     * Runs a full import.
     *
     * Articles are matched on their title: an existing Article with the same title
     * has its content overwritten, otherwise a new Article is created. Model events
     * are suppressed while writing.
     *
     * @param string|null $baseUrl Helpdesk root URL; null or an empty value falls back to
     *                             the built-in default. A trailing slash is removed.
     * @param bool        $dryRun  When true nothing is written; every article page that
     *                             parses successfully is counted under "imported".
     * @param int|null    $limit   Maximum number of article URLs to process; null or a
     *                             value <= 0 means no limit.
     *
     * @return array{categories:int, sections:int, article_urls:int, imported:int, updated:int, skipped:int, dry_run:bool}
     *
     * @throws \Throwable When the root page itself cannot be fetched (failures on
     *                    section or article pages are skipped instead).
     */
    public function import(?string $baseUrl = null, bool $dryRun = false, ?int $limit = null): array
    {
        $baseUrl = rtrim($baseUrl ?: self::DEFAULT_BASE_URL, '/');

        $rootHtml = $this->fetch($baseUrl);
        $categories = $this->extractCategories($rootHtml);
        $sectionUrls = $this->buildSectionUrls($baseUrl, $categories);
        $articleUrls = $this->collectArticleUrls($baseUrl, $rootHtml, $sectionUrls);

        if ($limit !== null && $limit > 0) {
            $articleUrls = array_slice($articleUrls, 0, $limit);
        }

        $imported = 0;
        $updated = 0;
        $skipped = 0;

        foreach ($articleUrls as $articleUrl) {
            $parsed = $this->parseArticlePage($articleUrl);

            if ($parsed === null) {
                $skipped++;
                continue;
            }

            if ($dryRun) {
                // The page is still fetched and parsed so the counters are realistic.
                $imported++;
                continue;
            }

            [$title, $content] = $parsed;

            $article = Article::withoutEvents(
                static fn () => Article::query()->updateOrCreate(
                    ['title' => $title],
                    ['content' => $content],
                ),
            );

            if ($article->wasRecentlyCreated) {
                $imported++;
            } else {
                $updated++;
            }
        }

        return [
            'categories' => count($categories),
            'sections' => count($sectionUrls),
            // Counted after $limit has been applied.
            'article_urls' => count($articleUrls),
            'imported' => $imported,
            'updated' => $updated,
            'skipped' => $skipped,
            'dry_run' => $dryRun,
        ];
    }

    /**
     * Downloads a page and returns its raw body.
     *
     * Uses a 30 second timeout and `retry(2, 300)`, i.e. at most two attempts with
     * 300 ms between them according to the Laravel HTTP client documentation.
     *
     * @throws \Throwable When the request fails or the response has an error status.
     */
    private function fetch(string $url): string
    {
        return Http::timeout(30)
            ->retry(2, 300)
            ->get($url)
            ->throw()
            ->body();
    }

    /**
     * Pulls the category tree out of the `const categories = [...];` script literal.
     *
     * The literal is decoded as JSON, so anything that is not valid JSON (or a page
     * without the literal) yields an empty list.
     *
     * @return array<int|string, mixed> Decoded category entries, or [] when unavailable.
     */
    private function extractCategories(string $html): array
    {
        if (!preg_match('/const\s+categories\s*=\s*(\[.*?\]);/s', $html, $matches)) {
            return [];
        }

        $decoded = json_decode($matches[1], true);

        return is_array($decoded) ? $decoded : [];
    }

    /**
     * Builds the listing-page URL of every category and of its direct children.
     *
     * Entries without both "id" and "slug" are skipped; a skipped category also
     * skips its children. Only one level of "children" is visited.
     *
     * @param array<int|string, mixed> $categories Decoded category tree.
     *
     * @return list<string> Unique section URLs of the form "{base}/{id}/{slug}".
     */
    private function buildSectionUrls(string $baseUrl, array $categories): array
    {
        $urls = [];

        foreach ($categories as $category) {
            $categoryUrl = $this->sectionUrl($baseUrl, $category);
            if ($categoryUrl === null) {
                continue;
            }
            $urls[] = $categoryUrl;

            // The tree comes from scraped data: tolerate a missing or malformed
            // "children" value instead of letting foreach emit a warning.
            $children = $category['children'] ?? [];
            if (!is_array($children)) {
                continue;
            }

            foreach ($children as $child) {
                $childUrl = $this->sectionUrl($baseUrl, $child);
                if ($childUrl !== null) {
                    $urls[] = $childUrl;
                }
            }
        }

        return array_values(array_unique($urls));
    }

    /**
     * Returns the section URL for one category node, or null when the node is not
     * an array carrying scalar "id" and "slug" values.
     */
    private function sectionUrl(string $baseUrl, mixed $node): ?string
    {
        if (!is_array($node) || !isset($node['id'], $node['slug'])) {
            return null;
        }

        if (!is_scalar($node['id']) || !is_scalar($node['slug'])) {
            return null;
        }

        return sprintf('%s/%d/%s', $baseUrl, (int) $node['id'], (string) $node['slug']);
    }

    /**
     * Collects the article URLs linked from the root page and from every section page.
     *
     * The already-downloaded root HTML is reused; each section page is fetched, and
     * a section that cannot be fetched is skipped (best effort).
     *
     * @param list<string> $sectionUrls Section pages to scan in addition to the root.
     *
     * @return list<string> Unique article URLs in order of first appearance.
     */
    private function collectArticleUrls(string $baseUrl, string $rootHtml, array $sectionUrls): array
    {
        $batches = [$this->matchArticleUrls($baseUrl, $rootHtml)];

        foreach ($sectionUrls as $sectionUrl) {
            try {
                // Mirrors the root-page shortcut: never download the root twice.
                $html = $sectionUrl === $baseUrl ? $rootHtml : $this->fetch($sectionUrl);
            } catch (\Throwable) {
                continue;
            }

            $batches[] = $this->matchArticleUrls($baseUrl, $html);
        }

        return array_values(array_unique(array_merge(...$batches)));
    }

    /**
     * Finds absolute article links ("{base}/{digits}-{slug}") in a page.
     *
     * The pattern is derived from $baseUrl so that a custom base URL passed to
     * import() is honoured. Matching is case-insensitive; the slug part is
     * lower-cased so differently-cased links to the same article de-duplicate.
     *
     * @return list<string>
     */
    private function matchArticleUrls(string $baseUrl, string $html): array
    {
        $pattern = '/' . preg_quote($baseUrl, '/') . '\/(\d+-[a-z0-9\-]+)/i';

        if (!preg_match_all($pattern, $html, $matches)) {
            return [];
        }

        return array_map(
            static fn (string $slug): string => $baseUrl . '/' . strtolower($slug),
            $matches[1],
        );
    }

    /**
     * Downloads one article page and reduces it to a title and a plain-text body.
     *
     * The body is prefixed with a "Source: {url}" line and capped via
     * Str::limit(..., 64000, '').
     *
     * @return array{0:string, 1:string}|null [title, content], or null when the page
     *                                        cannot be fetched or lacks a usable
     *                                        title or body.
     */
    private function parseArticlePage(string $url): ?array
    {
        try {
            $html = $this->fetch($url);
        } catch (\Throwable) {
            return null;
        }

        $title = $this->extractTitle($html);
        if ($title === null) {
            return null;
        }

        $content = $this->extractContent($html);
        if ($content === null) {
            return null;
        }

        $content = "Source: {$url}\n\n{$content}";

        return [$title, Str::limit($content, 64000, '')];
    }

    /**
     * Returns the text of the first <h1> as a single line, or null when the page
     * has no <h1> or it is empty after sanitising.
     */
    private function extractTitle(string $html): ?string
    {
        if (!preg_match('/<h1[^>]*>(.*?)<\/h1>/is', $html, $titleMatch)) {
            return null;
        }

        $title = $this->sanitizeText($titleMatch[1]);

        return $title === '' ? null : $title;
    }

    /**
     * Returns the article body as plain text with paragraph structure kept:
     * <br> becomes a line break and a paragraph boundary becomes a blank line.
     *
     * @return string|null Null when the `main_1_column` block is missing or empty.
     */
    private function extractContent(string $html): ?string
    {
        if (!preg_match('/<div\s+class="main_1_column">\s*(<p.*?<\/p>)\s*<\/div>/is', $html, $contentMatch)) {
            return null;
        }

        $raw = $contentMatch[1];
        // Whitespace in the HTML source (including newlines) is layout only; flatten
        // it first so the only line breaks left are the ones inserted below.
        $raw = preg_replace('/\s+/', ' ', $raw) ?? $raw;
        $raw = preg_replace('/<\s*br\s*\/?\s*>/i', "\n", $raw) ?? $raw;
        $raw = preg_replace('/<\/p>\s*<p[^>]*>/i', "\n\n", $raw) ?? $raw;

        $content = $this->sanitizeText($raw, true);

        return $content === '' ? null : $content;
    }

    /**
     * Strips tags, decodes HTML entities and normalises whitespace.
     *
     * @param string $value              HTML fragment.
     * @param bool   $preserveLineBreaks False (default): every whitespace run, newlines
     *                                   included, becomes one space. True: newlines are
     *                                   kept, other whitespace runs become one space,
     *                                   spaces next to a newline are dropped and three
     *                                   or more consecutive newlines shrink to two.
     */
    private function sanitizeText(string $value, bool $preserveLineBreaks = false): string
    {
        $text = html_entity_decode(strip_tags($value), ENT_QUOTES | ENT_HTML5, 'UTF-8');

        if (!$preserveLineBreaks) {
            $text = preg_replace('/\s+/', ' ', $text) ?? $text;

            return trim($text);
        }

        $text = str_replace(["\r\n", "\r"], "\n", $text);
        // [^\S\n] = any whitespace character except a newline.
        $text = preg_replace('/[^\S\n]+/', ' ', $text) ?? $text;
        $text = preg_replace('/ ?\n ?/', "\n", $text) ?? $text;
        $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;

        return trim($text);
    }
}