258 lines
8.4 KiB
PHP
258 lines
8.4 KiB
PHP
<?php
|
|
|
|
namespace App\Services;
|
|
|
|
use App\Models\Article;
|
|
use App\Models\Category;
|
|
use Illuminate\Support\Facades\Http;
|
|
use Illuminate\Support\Str;
|
|
|
|
/**
 * Imports helpdesk categories and articles by scraping the public helpdesk pages.
 *
 * The import runs in three phases:
 *  1. read the category tree embedded as a `const categories = [...]` script
 *     literal in the root page and upsert it into Category;
 *  2. visit the root page and every category/subcategory page to collect
 *     article URLs together with the category they were found under;
 *  3. fetch each article page, extract title and body text, and upsert it
 *     into Article keyed by (source, source_article_id).
 */
class HelpdeskImportService
{
    private const DEFAULT_BASE_URL = 'https://www.internettoday.nl/helpdesk';

    /**
     * Runs the import.
     *
     * @param string|null   $baseUrl  Helpdesk root URL; DEFAULT_BASE_URL is used when null or empty.
     *                                A trailing slash is stripped.
     * @param bool          $dryRun   When true no Category/Article rows are written; article pages are
     *                                still fetched and parsed, and every parsable article is counted
     *                                as "imported".
     * @param int|null      $limit    Maximum number of article URLs to process. Null, zero or a
     *                                negative value means "no limit".
     * @param callable|null $progress Called after each article as
     *                                ($processed, $total, $articleUrl, $status) where $status is one
     *                                of 'skipped', 'dry-run', 'imported', 'updated'.
     *
     * @return array{categories:int, sections:int, article_urls:int, imported:int, updated:int, skipped:int, dry_run:bool}
     *
     * Exceptions thrown while fetching the root page propagate to the caller;
     * failures on section or article pages are tolerated (see tryFetch()).
     */
    public function import(?string $baseUrl = null, bool $dryRun = false, ?int $limit = null, ?callable $progress = null): array
    {
        $baseUrl = rtrim($baseUrl ?: self::DEFAULT_BASE_URL, '/');

        $rootHtml = $this->fetch($baseUrl);
        $categories = $this->extractCategories($rootHtml);

        $categoryMap = $this->syncCategories($categories, $dryRun);
        $sections = $this->buildSections($baseUrl, $categories);

        $articleUrlMap = $this->collectArticleUrls($baseUrl, $rootHtml, $sections);
        if ($limit !== null && $limit > 0) {
            $articleUrlMap = array_slice($articleUrlMap, 0, $limit, true);
        }

        $total = count($articleUrlMap);
        $counters = ['imported' => 0, 'updated' => 0, 'skipped' => 0];
        $processed = 0;

        foreach ($articleUrlMap as $articleUrl => $meta) {
            $processed++;

            $status = $this->importArticle((string) $articleUrl, $meta, $categoryMap, $dryRun);

            // A dry run reports its own status to the callback but is tallied as "imported".
            $counters[$status === 'dry-run' ? 'imported' : $status]++;

            if ($progress !== null) {
                $progress($processed, $total, $articleUrl, $status);
            }
        }

        return [
            'categories' => count($categories),
            'sections' => count($sections),
            'article_urls' => $total,
            'imported' => $counters['imported'],
            'updated' => $counters['updated'],
            'skipped' => $counters['skipped'],
            'dry_run' => $dryRun,
        ];
    }

    /**
     * Fetches, parses and (unless dry-running) upserts a single article.
     *
     * @param string               $articleUrl  Article page URL.
     * @param array<string, mixed> $meta        Entry produced by collectArticleUrls().
     * @param array<int, int|null> $categoryMap External category id => local Category id.
     * @param bool                 $dryRun      When true nothing is written.
     *
     * @return string One of 'skipped', 'dry-run', 'imported', 'updated'.
     */
    private function importArticle(string $articleUrl, array $meta, array $categoryMap, bool $dryRun): string
    {
        $parsed = $this->parseArticlePage($articleUrl);
        if ($parsed === null) {
            return 'skipped';
        }

        if ($dryRun) {
            return 'dry-run';
        }

        [$title, $content, $sourceArticleId] = $parsed;

        $categoryId = $this->resolveCategoryId($meta['category_external_id'] ?? null, $categoryMap);
        $subcategoryId = $this->resolveCategoryId($meta['subcategory_external_id'] ?? null, $categoryMap);

        $article = Article::query()->updateOrCreate(
            ['source' => 'internettoday_helpdesk', 'source_article_id' => $sourceArticleId],
            [
                'title' => $title,
                'content' => $content,
                'source_url' => $articleUrl,
                'category_id' => $categoryId,
                'subcategory_id' => $subcategoryId,
            ]
        );

        return $article->wasRecentlyCreated ? 'imported' : 'updated';
    }

    /**
     * Downloads a page body.
     *
     * Uses a 30 second timeout and retry(2, 300); `throw()` turns HTTP error
     * responses into exceptions, so callers either get a body or an exception.
     */
    private function fetch(string $url): string
    {
        return Http::timeout(30)->retry(2, 300)->get($url)->throw()->body();
    }

    /**
     * Best-effort variant of fetch(): returns null instead of throwing.
     *
     * Any failure is deliberately swallowed so that one unreachable page does
     * not abort the whole import; the caller decides how to count the miss.
     */
    private function tryFetch(string $url): ?string
    {
        try {
            return $this->fetch($url);
        } catch (\Throwable) {
            return null;
        }
    }

    /**
     * Extracts the category tree from the `const categories = [...];` script
     * literal in the given HTML.
     *
     * @return array<int, mixed> Decoded JSON array, or [] when the literal is
     *                           missing or is not valid JSON.
     */
    private function extractCategories(string $html): array
    {
        if (! preg_match('/const\s+categories\s*=\s*(\[.*?\]);/s', $html, $matches)) {
            return [];
        }

        $decoded = json_decode($matches[1], true);

        return is_array($decoded) ? $decoded : [];
    }

    /**
     * Upserts top-level categories and their direct children.
     *
     * Entries lacking `id`, `title` or `slug` are ignored. In dry-run mode
     * nothing is written and every mapped value is null.
     *
     * @param array<int, mixed> $categories Tree as returned by extractCategories().
     *
     * @return array<int, int|null> External category id => local Category id.
     */
    private function syncCategories(array $categories, bool $dryRun): array
    {
        $map = [];

        foreach ($categories as $category) {
            if (! $this->hasKeys($category, ['id', 'title', 'slug'])) {
                continue;
            }

            $parentId = $this->upsertCategory($category, null, $dryRun);
            $map[(int) $category['id']] = $parentId;

            foreach ($this->childrenOf($category) as $child) {
                if (! $this->hasKeys($child, ['id', 'title', 'slug'])) {
                    continue;
                }

                $map[(int) $child['id']] = $this->upsertCategory($child, $parentId, $dryRun);
            }
        }

        return $map;
    }

    /**
     * Creates or updates one Category row keyed by its external id.
     *
     * @return int|null Local id of the row, or null in dry-run mode (no write happens).
     */
    private function upsertCategory(array $node, ?int $parentId, bool $dryRun): ?int
    {
        if ($dryRun) {
            return null;
        }

        $model = Category::query()->updateOrCreate(
            ['external_id' => (int) $node['id']],
            ['name' => (string) $node['title'], 'slug' => (string) $node['slug'], 'parent_id' => $parentId]
        );

        return $model->id;
    }

    /**
     * Builds the list of category and subcategory pages to scan for article links.
     *
     * Each section is emitted parent first, followed by its children. Entries
     * lacking `id` or `slug` are ignored.
     *
     * @param array<int, mixed> $categories Tree as returned by extractCategories().
     *
     * @return list<array{url:string, category_external_id:int, subcategory_external_id:int|null}>
     */
    private function buildSections(string $baseUrl, array $categories): array
    {
        $sections = [];

        foreach ($categories as $category) {
            if (! $this->hasKeys($category, ['id', 'slug'])) {
                continue;
            }

            $categoryExternalId = (int) $category['id'];

            $sections[] = [
                'url' => sprintf('%s/%d/%s', $baseUrl, $categoryExternalId, (string) $category['slug']),
                'category_external_id' => $categoryExternalId,
                'subcategory_external_id' => null,
            ];

            foreach ($this->childrenOf($category) as $child) {
                if (! $this->hasKeys($child, ['id', 'slug'])) {
                    continue;
                }

                $sections[] = [
                    'url' => sprintf('%s/%d/%s', $baseUrl, (int) $child['id'], (string) $child['slug']),
                    'category_external_id' => $categoryExternalId,
                    'subcategory_external_id' => (int) $child['id'],
                ];
            }
        }

        return $sections;
    }

    /**
     * Tells whether $node is an array in which every listed key is set and non-null.
     *
     * @param mixed        $node Candidate category/child entry from decoded JSON.
     * @param list<string> $keys Keys that must be present.
     */
    private function hasKeys(mixed $node, array $keys): bool
    {
        if (! is_array($node)) {
            return false;
        }

        foreach ($keys as $key) {
            if (! isset($node[$key])) {
                return false;
            }
        }

        return true;
    }

    /**
     * Returns the `children` list of a category, or [] when absent or not an array.
     *
     * @return array<int, mixed>
     */
    private function childrenOf(array $category): array
    {
        $children = $category['children'] ?? [];

        return is_array($children) ? $children : [];
    }

    /**
     * Scans the root page and every section page for article links.
     *
     * The root page is scanned first (its HTML is reused, not re-fetched), then
     * the sections in the given order. Sections whose page cannot be fetched are
     * skipped. A source may carry an `html` key, in which case that HTML is used
     * instead of fetching `url`.
     *
     * URLs keep the position at which they were first seen. Category information
     * is merged across sources: a URL first seen without a category (e.g. on the
     * root page) or without a subcategory (on the parent category page) picks up
     * the missing ids from a later source instead of staying uncategorised.
     *
     * @param list<array<string, mixed>> $sections Output of buildSections().
     *
     * @return array<string, array{category_external_id:int|null, subcategory_external_id:int|null}>
     *         Lower-cased article URL => category metadata.
     */
    private function collectArticleUrls(string $baseUrl, string $rootHtml, array $sections): array
    {
        $sources = array_merge([
            ['url' => $baseUrl, 'category_external_id' => null, 'subcategory_external_id' => null, 'html' => $rootHtml],
        ], $sections);

        $result = [];

        foreach ($sources as $source) {
            $html = array_key_exists('html', $source)
                ? (string) $source['html']
                : $this->tryFetch((string) $source['url']);

            if ($html === null) {
                continue;
            }

            foreach ($this->extractArticleUrls($html) as $url) {
                // Assigning to an existing key keeps its original position,
                // so first-seen ordering (relevant for $limit) is preserved.
                $result[$url] = $this->mergeArticleMeta($result[$url] ?? null, $source);
            }
        }

        return $result;
    }

    /**
     * Finds all article links in a page and returns them lower-cased.
     *
     * NOTE(review): the pattern is tied to the production host, so pages served
     * from a different base URL yield links only if they point at that host.
     *
     * @return list<string>
     */
    private function extractArticleUrls(string $html): array
    {
        $urls = [];

        if (preg_match_all('/https:\/\/www\.internettoday\.nl\/helpdesk\/(\d+)-[a-z0-9\-]+/i', $html, $matches)) {
            foreach ($matches[0] as $match) {
                $urls[] = strtolower($match);
            }
        }

        return $urls;
    }

    /**
     * Combines already-known category metadata for a URL with that of another source.
     *
     * Known non-null values always win; only missing ids are filled in, and a
     * subcategory is only adopted when it belongs to the already-known category.
     *
     * @param array{category_external_id:int|null, subcategory_external_id:int|null}|null $known
     * @param array<string, mixed> $source
     *
     * @return array{category_external_id:int|null, subcategory_external_id:int|null}
     */
    private function mergeArticleMeta(?array $known, array $source): array
    {
        $incoming = [
            'category_external_id' => $source['category_external_id'],
            'subcategory_external_id' => $source['subcategory_external_id'],
        ];

        if ($known === null || $known['category_external_id'] === null) {
            return $incoming;
        }

        if ($known['subcategory_external_id'] === null
            && $known['category_external_id'] === $incoming['category_external_id']
        ) {
            $known['subcategory_external_id'] = $incoming['subcategory_external_id'];
        }

        return $known;
    }

    /**
     * Fetches an article page and extracts its data.
     *
     * @return array{0:string, 1:string, 2:int}|null [title, content, source article id], or null when
     *         the URL carries no article id, the page cannot be fetched, or title/content are missing.
     *         Content is truncated via Str::limit($content, 64000, '').
     */
    private function parseArticlePage(string $url): ?array
    {
        // Validate the id first: without it the article cannot be stored, so
        // there is no point in spending an HTTP request on the page.
        if (! preg_match('/\/helpdesk\/(\d+)-/', $url, $idMatch)) {
            return null;
        }

        $html = $this->tryFetch($url);
        if ($html === null) {
            return null;
        }

        $parsed = $this->parseArticleHtml($html);
        if ($parsed === null) {
            return null;
        }

        [$title, $content] = $parsed;

        return [$title, Str::limit($content, 64000, ''), (int) $idMatch[1]];
    }

    /**
     * Extracts title and body text from article HTML.
     *
     * The title is the text of the first <h1>. The body is the run of <p>
     * elements inside `<div class="main_1_column">`; <br> becomes a line break
     * and a paragraph boundary becomes a blank line, and these breaks are kept
     * in the returned plain text.
     *
     * @return array{0:string, 1:string}|null [title, content], or null when either is missing or empty.
     */
    private function parseArticleHtml(string $html): ?array
    {
        if (! preg_match('/<h1[^>]*>(.*?)<\/h1>/is', $html, $titleMatch)) {
            return null;
        }

        $title = $this->sanitizeText($titleMatch[1]);
        if ($title === '') {
            return null;
        }

        if (! preg_match('/<div\s+class="main_1_column">\s*(<p.*?<\/p>)\s*<\/div>/is', $html, $contentMatch)) {
            return null;
        }

        $contentRaw = $contentMatch[1];
        // Whitespace in HTML source is layout only; flatten it first so the
        // only newlines left are the ones derived from markup below.
        $contentRaw = preg_replace('/\s+/', ' ', $contentRaw) ?? $contentRaw;
        $contentRaw = preg_replace('/<\s*br\s*\/?\s*>/i', "\n", $contentRaw) ?? $contentRaw;
        $contentRaw = preg_replace('/<\/p>\s*<p[^>]*>/i', "\n\n", $contentRaw) ?? $contentRaw;

        $content = $this->sanitizeText($contentRaw, true);
        if ($content === '') {
            return null;
        }

        return [$title, $content];
    }

    /**
     * Maps an external category id to a local Category id.
     *
     * The in-memory map is consulted first; when it has no (non-null) entry the
     * database is queried by `external_id`.
     *
     * @param array<int, int|null> $map External id => local id, as built by syncCategories().
     *
     * @return int|null Local id, or null when $externalId is null or unknown.
     */
    private function resolveCategoryId(?int $externalId, array $map): ?int
    {
        if ($externalId === null) {
            return null;
        }

        $localId = $map[$externalId] ?? Category::query()->where('external_id', $externalId)->value('id');

        // Database drivers may hand back numeric strings; normalise for the ?int contract.
        return $localId === null ? null : (int) $localId;
    }

    /**
     * Converts an HTML fragment to trimmed plain text.
     *
     * Tags are stripped and entities decoded. By default every whitespace run
     * (including non-breaking spaces) collapses to one space. With
     * $keepLineBreaks the newlines survive: spaces around them are removed and
     * runs of three or more newlines are reduced to a single blank line.
     *
     * @param string $value          HTML fragment.
     * @param bool   $keepLineBreaks Preserve "\n" characters instead of collapsing them.
     */
    private function sanitizeText(string $value, bool $keepLineBreaks = false): string
    {
        $text = html_entity_decode(strip_tags($value), ENT_QUOTES | ENT_HTML5, 'UTF-8');

        if (! $keepLineBreaks) {
            // The /u pattern returns null on invalid UTF-8; fall back to byte mode then.
            $text = preg_replace('/\s+/u', ' ', $text) ?? preg_replace('/\s+/', ' ', $text) ?? $text;

            return trim($text);
        }

        $text = str_replace(["\r\n", "\r"], "\n", $text);
        $text = preg_replace('/[^\S\n]+/u', ' ', $text) ?? preg_replace('/[^\S\n]+/', ' ', $text) ?? $text;
        $text = preg_replace('/ ?\n ?/', "\n", $text) ?? $text;
        $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;

        return trim($text);
    }
}
|