171 lines
5.0 KiB
PHP
171 lines
5.0 KiB
PHP
<?php
|
|
|
|
namespace App\Services;
|
|
|
|
use App\Models\Article;
|
|
use Illuminate\Support\Facades\Http;
|
|
use Illuminate\Support\Str;
|
|
|
|
/**
 * Scrapes a helpdesk website and stores its articles as Article records.
 *
 * The flow is: fetch the root page, read the category tree embedded in it,
 * derive the section pages from that tree, harvest article links from the
 * root and section pages, then fetch and parse every article page.
 */
class HelpdeskImportService
{
    private const DEFAULT_BASE_URL = 'https://www.internettoday.nl/helpdesk';

    /**
     * Limit handed to Str::limit() for the stored article content.
     * NOTE(review): presumably chosen to fit the content column — confirm against the schema.
     */
    private const MAX_CONTENT_LENGTH = 64000;

    /**
     * Run the import.
     *
     * @param string|null $baseUrl Helpdesk root URL; null or an empty string falls back to the default.
     * @param bool        $dryRun  When true, article pages are still fetched and parsed but nothing is
     *                             written; every parsable article is counted as "imported".
     * @param int|null    $limit   Maximum number of article URLs to process; null or a value <= 0 means no limit.
     *
     * @return array{categories:int, sections:int, article_urls:int, imported:int, updated:int, skipped:int, dry_run:bool}
     *
     * @throws \Throwable When the root page cannot be fetched (the HTTP client's exception propagates).
     */
    public function import(?string $baseUrl = null, bool $dryRun = false, ?int $limit = null): array
    {
        // `?:` (not `??`) on purpose: an empty string must also fall back to the default.
        $baseUrl = rtrim($baseUrl ?: self::DEFAULT_BASE_URL, '/');

        $rootHtml = $this->fetch($baseUrl);
        $categories = $this->extractCategories($rootHtml);

        $sectionUrls = $this->buildSectionUrls($baseUrl, $categories);
        $articleUrls = $this->collectArticleUrls($baseUrl, $rootHtml, $sectionUrls);
        if ($limit !== null && $limit > 0) {
            $articleUrls = array_slice($articleUrls, 0, $limit);
        }

        $imported = 0;
        $updated = 0;
        $skipped = 0;

        foreach ($articleUrls as $articleUrl) {
            $parsed = $this->parseArticlePage($articleUrl);
            if ($parsed === null) {
                $skipped++;
                continue;
            }

            if ($dryRun) {
                $imported++;
                continue;
            }

            [$title, $content] = $parsed;

            // Articles are keyed by title; model events are suppressed for the write.
            $article = Article::withoutEvents(
                static fn () => Article::query()->updateOrCreate(
                    ['title' => $title],
                    ['content' => $content]
                )
            );

            if ($article->wasRecentlyCreated) {
                $imported++;
            } else {
                $updated++;
            }
        }

        return [
            'categories' => count($categories),
            'sections' => count($sectionUrls),
            'article_urls' => count($articleUrls),
            'imported' => $imported,
            'updated' => $updated,
            'skipped' => $skipped,
            'dry_run' => $dryRun,
        ];
    }

    /**
     * GET a URL and return the response body.
     *
     * Uses a 30 second timeout and the HTTP client's retry(2, 300) policy;
     * throw() turns an unsuccessful response into an exception.
     */
    private function fetch(string $url): string
    {
        return Http::timeout(30)
            ->retry(2, 300)
            ->get($url)
            ->throw()
            ->body();
    }

    /**
     * Read the `const categories = [...];` literal embedded in the root page.
     *
     * @return array Decoded category list, or an empty array when the literal is
     *               missing or is not valid JSON.
     */
    private function extractCategories(string $html): array
    {
        if (!preg_match('/const\s+categories\s*=\s*(\[.*?\]);/s', $html, $matches)) {
            return [];
        }

        $decoded = json_decode($matches[1], true);

        return is_array($decoded) ? $decoded : [];
    }

    /**
     * Build the section page URLs for every category and its direct children.
     *
     * Entries that are not arrays, or that lack a scalar `id`/`slug`, are skipped.
     * Children of a skipped category are skipped with it.
     *
     * @param array $categories Decoded category tree as returned by extractCategories().
     *
     * @return array List of unique section URLs in discovery order.
     */
    private function buildSectionUrls(string $baseUrl, array $categories): array
    {
        $urls = [];

        foreach ($categories as $category) {
            $categoryUrl = $this->sectionUrl($baseUrl, $category);
            if ($categoryUrl === null) {
                continue;
            }
            $urls[] = $categoryUrl;

            // Guard against a malformed `children` value instead of iterating a non-array.
            $children = $category['children'] ?? [];
            if (!is_array($children)) {
                continue;
            }

            foreach ($children as $child) {
                $childUrl = $this->sectionUrl($baseUrl, $child);
                if ($childUrl !== null) {
                    $urls[] = $childUrl;
                }
            }
        }

        return array_values(array_unique($urls));
    }

    /**
     * Format the `<base>/<id>/<slug>` URL for one category node, or null when the node is unusable.
     */
    private function sectionUrl(string $baseUrl, mixed $node): ?string
    {
        if (!is_array($node) || !isset($node['id'], $node['slug'])) {
            return null;
        }
        if (!is_scalar($node['id']) || !is_scalar($node['slug'])) {
            return null;
        }

        return sprintf('%s/%d/%s', $baseUrl, (int) $node['id'], (string) $node['slug']);
    }

    /**
     * Harvest article links from the root page and every section page.
     *
     * Section pages that cannot be fetched are skipped (best effort). Links are
     * matched against the given base URL as well as the default helpdesk URL, so
     * a custom base URL works while pages linking to the default host still do.
     *
     * @param string $rootHtml    Already-fetched HTML of $baseUrl (avoids a second request).
     * @param array  $sectionUrls Section page URLs to scan in addition to the root page.
     *
     * @return array List of unique, lower-cased article URLs in discovery order.
     */
    private function collectArticleUrls(string $baseUrl, string $rootHtml, array $sectionUrls): array
    {
        $pattern = $this->articleUrlPattern($baseUrl);
        $urls = [];

        foreach (array_merge([$baseUrl], $sectionUrls) as $url) {
            try {
                $html = $url === $baseUrl ? $rootHtml : $this->fetch($url);
            } catch (\Throwable) {
                // Deliberate best effort: one unreachable section must not abort the import.
                continue;
            }

            preg_match_all($pattern, $html, $matches);
            foreach (($matches[0] ?? []) as $match) {
                $urls[] = strtolower($match);
            }
        }

        return array_values(array_unique($urls));
    }

    /**
     * Build the case-insensitive regex matching `<base>/<digits>-<slug>` article links.
     */
    private function articleUrlPattern(string $baseUrl): string
    {
        $bases = array_unique([$baseUrl, self::DEFAULT_BASE_URL]);
        $quoted = array_map(static fn (string $base): string => preg_quote($base, '/'), $bases);

        return '/(?:' . implode('|', $quoted) . ')\/\d+-[a-z0-9\-]+/i';
    }

    /**
     * Fetch one article page and extract its title and content.
     *
     * @return array|null `[title, content]`, or null when the page cannot be fetched
     *                    or has no usable title/content. The content is prefixed with
     *                    a "Source: <url>" line and limited via Str::limit().
     */
    private function parseArticlePage(string $url): ?array
    {
        try {
            $html = $this->fetch($url);
        } catch (\Throwable) {
            // Deliberate best effort: the caller counts this article as skipped.
            return null;
        }

        $title = $this->extractTitle($html);
        if ($title === null) {
            return null;
        }

        $content = $this->extractContent($html);
        if ($content === null) {
            return null;
        }

        $content = "Source: {$url}\n\n{$content}";

        return [$title, Str::limit($content, self::MAX_CONTENT_LENGTH, '')];
    }

    /**
     * Return the sanitized text of the first `<h1>` element, or null when absent or empty.
     */
    private function extractTitle(string $html): ?string
    {
        if (!preg_match('/<h1[^>]*>(.*?)<\/h1>/is', $html, $titleMatch)) {
            return null;
        }

        $title = $this->sanitizeText($titleMatch[1]);

        return $title === '' ? null : $title;
    }

    /**
     * Return the article body as plain text, or null when absent or empty.
     *
     * The body is the run of paragraphs inside `<div class="main_1_column">`.
     * `<br>` becomes a line break and a paragraph boundary becomes a blank line;
     * these breaks are kept in the result.
     */
    private function extractContent(string $html): ?string
    {
        if (!preg_match('/<div\s+class="main_1_column">\s*(<p.*?<\/p>)\s*<\/div>/is', $html, $contentMatch)) {
            return null;
        }

        $fragment = $contentMatch[1];

        // Whitespace in the HTML source is not meaningful; flatten it first so
        // that only the breaks derived from markup below survive.
        $fragment = preg_replace('/\s+/', ' ', $fragment) ?? $fragment;
        $fragment = preg_replace('/<\s*br\s*\/?\s*>/i', "\n", $fragment) ?? $fragment;
        $fragment = preg_replace('/<\/p>\s*<p[^>]*>/i', "\n\n", $fragment) ?? $fragment;

        $text = html_entity_decode(strip_tags($fragment), ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // Collapse runs of non-newline whitespace, drop spaces hugging a line
        // break, and allow at most one blank line between paragraphs.
        $text = preg_replace('/[^\S\n]+/', ' ', $text) ?? $text;
        $text = preg_replace('/ *\n */', "\n", $text) ?? $text;
        $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;
        $text = trim($text);

        return $text === '' ? null : $text;
    }

    /**
     * Strip tags, decode HTML entities and collapse all whitespace to single spaces.
     *
     * Produces single-line text; used for titles, which act as the upsert key.
     */
    private function sanitizeText(string $value): string
    {
        $decoded = html_entity_decode(strip_tags($value), ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $decoded = preg_replace('/\s+/', ' ', $decoded) ?? $decoded;

        return trim($decoded);
    }
}
|