Files
TicketAssistent/app/Services/HelpdeskImportService.php

258 lines
8.4 KiB
PHP

<?php
namespace App\Services;
use App\Models\Article;
use App\Models\Category;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Str;
class HelpdeskImportService
{
/** Helpdesk root URL that import() falls back to when it is given no (or an empty) base URL. */
private const DEFAULT_BASE_URL = 'https://www.internettoday.nl/helpdesk';
/**
 * Crawls the helpdesk site and mirrors its categories and articles locally.
 *
 * Steps: fetch the root page, extract and sync the category tree, derive the
 * section pages, collect every article URL, then fetch and parse each article.
 * Articles are upserted on the pair (source, source_article_id).
 *
 * @param string|null   $baseUrl  Helpdesk root; null or '' selects DEFAULT_BASE_URL. Trailing slashes are dropped.
 * @param bool          $dryRun   When true, pages are still fetched and parsed but no article is written;
 *                                every parsable article is counted as "imported".
 * @param int|null      $limit    Maximum number of article URLs to process; null or a value <= 0 means no limit.
 * @param callable|null $progress Invoked once per article as ($position, $total, $articleUrl, $status) where
 *                                $status is one of 'skipped', 'dry-run', 'imported', 'updated'.
 *
 * @return array{categories:int, sections:int, article_urls:int, imported:int, updated:int, skipped:int, dry_run:bool}
 */
public function import(?string $baseUrl = null, bool $dryRun = false, ?int $limit = null, ?callable $progress = null): array
{
    $baseUrl = rtrim($baseUrl ?: self::DEFAULT_BASE_URL, '/');

    $rootHtml = $this->fetch($baseUrl);
    $categories = $this->extractCategories($rootHtml);
    $categoryMap = $this->syncCategories($categories, $dryRun);
    $sections = $this->buildSections($baseUrl, $categories);

    $articleUrlMap = $this->collectArticleUrls($baseUrl, $rootHtml, $sections);
    if ($limit !== null && $limit > 0) {
        $articleUrlMap = array_slice($articleUrlMap, 0, $limit, true);
    }

    $total = count($articleUrlMap);
    $tally = ['imported' => 0, 'updated' => 0, 'skipped' => 0];
    $position = 0;

    foreach ($articleUrlMap as $articleUrl => $meta) {
        $position++;
        $parsed = $this->parseArticlePage($articleUrl);

        if ($parsed === null) {
            $status = 'skipped';
        } elseif ($dryRun) {
            $status = 'dry-run';
        } else {
            [$title, $content, $sourceArticleId] = $parsed;
            $categoryId = $this->resolveCategoryId($meta['category_external_id'] ?? null, $categoryMap);
            $subcategoryId = $this->resolveCategoryId($meta['subcategory_external_id'] ?? null, $categoryMap);

            $article = Article::query()->updateOrCreate(
                ['source' => 'internettoday_helpdesk', 'source_article_id' => $sourceArticleId],
                [
                    'title' => $title,
                    'content' => $content,
                    'source_url' => $articleUrl,
                    'category_id' => $categoryId,
                    'subcategory_id' => $subcategoryId,
                ]
            );

            $status = $article->wasRecentlyCreated ? 'imported' : 'updated';
        }

        // A dry run has no counter of its own: would-be writes are tallied as "imported".
        $tally[$status === 'dry-run' ? 'imported' : $status]++;

        if ($progress !== null) {
            $progress($position, $total, $articleUrl, $status);
        }
    }

    return [
        'categories' => count($categories),
        'sections' => count($sections),
        'article_urls' => $total,
        'imported' => $tally['imported'],
        'updated' => $tally['updated'],
        'skipped' => $tally['skipped'],
        'dry_run' => $dryRun,
    ];
}
/**
 * Issues a GET request for $url and returns the raw response body.
 *
 * The request uses a 30 second timeout and the HTTP client's retry(2, 300) policy
 * (two attempts, 300 ms apart according to Laravel's HTTP client documentation).
 * throw() is invoked on the response, so an error response surfaces as an exception
 * instead of being returned as a body.
 */
private function fetch(string $url): string
{
    $client = Http::timeout(30)->retry(2, 300);

    $response = $client->get($url);
    $response->throw();

    return $response->body();
}
/**
 * Pulls the category tree out of the inline script on a helpdesk page.
 *
 * Looks for a `const categories = [...];` statement and JSON-decodes the array
 * literal into associative arrays.
 *
 * @return array Decoded categories, or an empty array when the statement is
 *               missing or its payload does not decode to an array.
 */
private function extractCategories(string $html): array
{
    $found = preg_match('/const\s+categories\s*=\s*(\[.*?\]);/s', $html, $matches);
    if ($found !== 1) {
        return [];
    }

    $categories = json_decode($matches[1], true);
    if (! is_array($categories)) {
        return [];
    }

    return $categories;
}
/**
 * Upserts the two-level category tree and maps external ids to local ids.
 *
 * Top-level categories are stored with parent_id null; their children are stored
 * with the parent's local id. Nodes lacking id, title or slug are ignored (an
 * ignored parent also drops its children).
 *
 * @param array $categories Decoded category tree (each node may carry a 'children' list).
 * @param bool  $dryRun     When true nothing is written and every mapped value is null.
 *
 * @return array<int, mixed> External category id => local Category id (null during a dry run).
 */
private function syncCategories(array $categories, bool $dryRun): array
{
    $isComplete = static fn ($node): bool => isset($node['id'], $node['title'], $node['slug']);

    // Persists one node unless this is a dry run; yields the local id (or null).
    $store = static function ($node, $parentLocalId) use ($dryRun) {
        if ($dryRun) {
            return null;
        }

        return Category::query()->updateOrCreate(
            ['external_id' => (int) $node['id']],
            ['name' => (string) $node['title'], 'slug' => (string) $node['slug'], 'parent_id' => $parentLocalId]
        )->id;
    };

    $localIds = [];

    foreach ($categories as $parent) {
        if (! $isComplete($parent)) {
            continue;
        }

        $parentLocalId = $store($parent, null);
        $localIds[(int) $parent['id']] = $parentLocalId;

        foreach (($parent['children'] ?? []) as $child) {
            if (! $isComplete($child)) {
                continue;
            }

            $localIds[(int) $child['id']] = $store($child, $parentLocalId);
        }
    }

    return $localIds;
}
/**
 * Lists the section pages (one per category and per subcategory) to crawl.
 *
 * Each section URL has the form "{baseUrl}/{id}/{slug}". A subcategory section
 * records both its parent's external id and its own. Nodes without id or slug
 * are left out (a skipped parent also drops its children).
 *
 * @return array<int, array{url:string, category_external_id:int, subcategory_external_id:int|null}>
 */
private function buildSections(string $baseUrl, array $categories): array
{
    $describe = static fn ($node, int $categoryId, ?int $subcategoryId): array => [
        'url' => sprintf('%s/%d/%s', $baseUrl, (int) $node['id'], (string) $node['slug']),
        'category_external_id' => $categoryId,
        'subcategory_external_id' => $subcategoryId,
    ];

    $sections = [];

    foreach ($categories as $parent) {
        if (! isset($parent['id'], $parent['slug'])) {
            continue;
        }

        $parentId = (int) $parent['id'];
        $sections[] = $describe($parent, $parentId, null);

        foreach (($parent['children'] ?? []) as $child) {
            if (isset($child['id'], $child['slug'])) {
                $sections[] = $describe($child, $parentId, (int) $child['id']);
            }
        }
    }

    return $sections;
}
/**
 * Scans the root page and every section page for article links.
 *
 * A source that carries an 'html' key is scanned as-is; any other source is
 * downloaded from its 'url'. A section whose download fails is skipped so one
 * broken page does not abort the whole crawl (deliberate best effort).
 *
 * Article links are recognised under the production helpdesk prefix and, when it
 * differs, under $baseUrl, so a crawl of another host still finds its articles.
 * URLs are lower-cased before being used as keys.
 *
 * The root page is scanned first and carries no category. Therefore metadata of
 * an already-known URL is upgraded when a later source is more specific:
 *  - an entry without a category adopts the category/subcategory of the first
 *    source that has one;
 *  - an entry without a subcategory adopts one only from a source in the same
 *    category, so the pair never mixes two different category branches.
 *
 * @param string $baseUrl  Helpdesk root URL.
 * @param string $rootHtml Already-downloaded HTML of the root page.
 * @param array  $sections Section descriptors with url, category_external_id and subcategory_external_id.
 *
 * @return array<string, array{category_external_id:int|null, subcategory_external_id:int|null}>
 *               Lower-cased article URL => category metadata, in discovery order.
 */
private function collectArticleUrls(string $baseUrl, string $rootHtml, array $sections): array
{
    // Always accept the production prefix (the original behaviour); additionally
    // accept the caller's base URL. An empty base is ignored because an empty
    // alternative would make the pattern match any "/<id>-<slug>" fragment.
    $prefixes = ['https:\/\/www\.internettoday\.nl\/helpdesk'];
    $customPrefix = preg_quote(rtrim($baseUrl, '/'), '/');
    if ($customPrefix !== '') {
        $prefixes[] = $customPrefix;
    }
    $pattern = '/(?:' . implode('|', $prefixes) . ')\/(\d+)-[a-z0-9\-]+/i';

    $result = [];
    $sources = array_merge([
        ['url' => $baseUrl, 'category_external_id' => null, 'subcategory_external_id' => null, 'html' => $rootHtml],
    ], $sections);

    foreach ($sources as $source) {
        try {
            $html = array_key_exists('html', $source)
                ? (string) $source['html']
                : $this->fetch((string) $source['url']);
        } catch (\Throwable) {
            // Best effort: an unreachable section only costs the links found on it.
            continue;
        }

        $categoryId = $source['category_external_id'];
        $subcategoryId = $source['subcategory_external_id'];

        preg_match_all($pattern, $html, $matches);

        foreach ($matches[0] as $match) {
            $url = strtolower($match);

            if (! isset($result[$url])) {
                $result[$url] = [
                    'category_external_id' => $categoryId,
                    'subcategory_external_id' => $subcategoryId,
                ];
                continue;
            }

            $known = $result[$url];

            if ($known['category_external_id'] === null) {
                if ($categoryId !== null) {
                    // First categorised sighting: take category and subcategory together.
                    $result[$url] = [
                        'category_external_id' => $categoryId,
                        'subcategory_external_id' => $subcategoryId,
                    ];
                }
            } elseif (
                $known['subcategory_external_id'] === null
                && $subcategoryId !== null
                && $known['category_external_id'] === $categoryId
            ) {
                $result[$url]['subcategory_external_id'] = $subcategoryId;
            }
        }
    }

    return $result;
}
/**
 * Downloads one article page and extracts its title, text and numeric id.
 *
 * The id is taken from the "/helpdesk/{id}-" part of the URL and is checked
 * before any request is made, so a URL without an id costs no HTTP round trip.
 * The title is the first <h1>; the body is the paragraph block directly inside
 * <div class="main_1_column">. Line breaks (<br>) and paragraph boundaries are
 * kept as "\n" and "\n\n" in the returned text. The text is cut at 64000
 * characters without an ellipsis.
 *
 * @return array{0:string, 1:string, 2:int}|null [title, content, article id], or null when the URL
 *         has no id, the download fails, or title/content are missing or empty.
 */
private function parseArticlePage(string $url): ?array
{
    if (! preg_match('/\/helpdesk\/(\d+)-/', $url, $idMatch)) {
        return null;
    }

    try {
        $html = $this->fetch($url);
    } catch (\Throwable) {
        // An unreachable article is reported to the caller as "not parsable".
        return null;
    }

    if (! preg_match('/<h1[^>]*>(.*?)<\/h1>/is', $html, $titleMatch)) {
        return null;
    }
    $title = $this->sanitizeText($titleMatch[1]);
    if ($title === '') {
        return null;
    }

    if (! preg_match('/<div\s+class="main_1_column">\s*(<p.*?<\/p>)\s*<\/div>/is', $html, $contentMatch)) {
        return null;
    }
    $content = $this->htmlToPlainText($contentMatch[1]);
    if ($content === '') {
        return null;
    }

    return [$title, Str::limit($content, 64000, ''), (int) $idMatch[1]];
}

/**
 * Converts an HTML fragment to plain text while keeping its line structure.
 *
 * Unlike sanitizeText(), which flattens all whitespace to single spaces, this
 * keeps the "\n" produced for <br> and the "\n\n" produced between paragraphs.
 */
private function htmlToPlainText(string $html): string
{
    // Whitespace in the HTML source is layout only; flatten it before real breaks are inserted.
    $text = preg_replace('/\s+/u', ' ', $html) ?? $html;

    $text = preg_replace('/<\s*br\s*\/?\s*>/i', "\n", $text) ?? $text;
    $text = preg_replace('/<\/p>\s*<p[^>]*>/i', "\n\n", $text) ?? $text;

    $text = html_entity_decode(strip_tags($text), ENT_QUOTES | ENT_HTML5, 'UTF-8');

    // Collapse runs of whitespace other than newlines (includes NBSP decoded from &nbsp;).
    $text = preg_replace('/[^\S\n]+/u', ' ', $text) ?? $text;
    // Drop spaces hugging a line break and cap blank-line runs at one empty line.
    $text = preg_replace('/ ?\n ?/', "\n", $text) ?? $text;
    $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;

    return trim($text);
}
/**
 * Translates an external category id into the local Category id.
 *
 * The in-memory map is consulted first; when it has no non-null entry for the
 * id, the categories table is queried by external_id.
 *
 * @param int|null $externalId External id, or null when the article has none.
 * @param array    $map        External id => local id, as produced by syncCategories().
 *
 * @return int|null Local id, or null when $externalId is null or nothing is found.
 */
private function resolveCategoryId(?int $externalId, array $map): ?int
{
    if ($externalId === null) {
        return null;
    }

    if (isset($map[$externalId])) {
        return $map[$externalId];
    }

    return Category::query()->where('external_id', $externalId)->value('id');
}
/**
 * Reduces an HTML fragment to a single line of plain text.
 *
 * Tags are stripped, HTML entities are decoded to UTF-8, every run of whitespace
 * becomes one space and the result is trimmed.
 *
 * The whitespace pattern runs in UTF-8 mode (`u` modifier) so that Unicode
 * whitespace such as the no-break space decoded from `&nbsp;` is collapsed too,
 * and so that bytes inside multi-byte characters are never treated as whitespace.
 * If the text is not valid UTF-8 the pattern fails; in that case the byte-wise
 * pattern is used instead.
 */
private function sanitizeText(string $value): string
{
    $decoded = html_entity_decode(strip_tags($value), ENT_QUOTES | ENT_HTML5, 'UTF-8');

    $collapsed = preg_replace('/\s+/u', ' ', $decoded);
    if ($collapsed === null) {
        // Invalid UTF-8: fall back to byte-wise matching rather than losing the text.
        $collapsed = preg_replace('/\s+/', ' ', $decoded) ?? $decoded;
    }

    return trim($collapsed);
}
}