fetch($baseUrl);
        $categories = $this->extractCategories($rootHtml);
        $categoryMap = $this->syncCategories($categories, $dryRun);
        $sections = $this->buildSections($baseUrl, $categories);
        $articleUrlMap = $this->collectArticleUrls($baseUrl, $rootHtml, $sections);

        // A null or non-positive limit means "process everything".
        if ($limit !== null && $limit > 0) {
            $articleUrlMap = array_slice($articleUrlMap, 0, $limit, true);
        }

        $total = count($articleUrlMap);
        $imported = 0;
        $updated = 0;
        $skipped = 0;
        $processed = 0;

        foreach ($articleUrlMap as $articleUrl => $meta) {
            $processed++;
            $parsed = $this->parseArticlePage($articleUrl);

            if ($parsed === null) {
                $skipped++;
                $progress && $progress($processed, $total, $articleUrl, 'skipped');
                continue;
            }

            if ($dryRun) {
                // Dry runs count every parsable article as "imported" without writing anything.
                $imported++;
                $progress && $progress($processed, $total, $articleUrl, 'dry-run');
                continue;
            }

            [$title, $content, $sourceArticleId] = $parsed;
            $categoryId = $this->resolveCategoryId($meta['category_external_id'] ?? null, $categoryMap);
            $subcategoryId = $this->resolveCategoryId($meta['subcategory_external_id'] ?? null, $categoryMap);

            // Articles are keyed on (source, source_article_id), so re-running the import updates in place.
            $result = Article::query()->updateOrCreate(
                ['source' => 'internettoday_helpdesk', 'source_article_id' => $sourceArticleId],
                [
                    'title' => $title,
                    'content' => $content,
                    'source_url' => $articleUrl,
                    'category_id' => $categoryId,
                    'subcategory_id' => $subcategoryId,
                ]
            );

            if ($result->wasRecentlyCreated) {
                $imported++;
                $progress && $progress($processed, $total, $articleUrl, 'imported');
            } else {
                $updated++;
                $progress && $progress($processed, $total, $articleUrl, 'updated');
            }
        }

        return [
            'categories' => count($categories),
            'sections' => count($sections),
            'article_urls' => $total,
            'imported' => $imported,
            'updated' => $updated,
            'skipped' => $skipped,
            'dry_run' => $dryRun,
        ];
    }

    /**
     * Download a page and return its response body.
     *
     * The request is issued with timeout(30) and retry(2, 300); throw() is called on the
     * response before the body is read, so callers should expect an exception on failure
     * (the callers in this class catch \Throwable).
     *
     * @param string $url Absolute URL to request.
     *
     * @return string Raw response body.
     */
    private function fetch(string $url): string
    {
        return Http::timeout(30)
            ->retry(2, 300)
            ->get($url)
            ->throw()
            ->body();
    }

    /**
     * Pull the category tree out of the inline "const categories = [...];" script assignment.
     *
     * @param string $html Page HTML to search.
     *
     * @return array Decoded category list, or an empty array when the assignment is missing
     *               or its payload does not decode to an array.
     */
    private function extractCategories(string $html): array
    {
        // The lazy quantifier ends the capture at the first "];" after the opening bracket.
        if (! preg_match('/const\s+categories\s*=\s*(\[.*?\]);/s', $html, $matches)) {
            return [];
        }

        $decoded = json_decode($matches[1], true);

        return is_array($decoded) ? $decoded : [];
    }

    /**
     * Upsert top-level categories and their children, returning an external-id => local-id map.
     *
     * In dry-run mode nothing is written and every mapped value is null.
     *
     * @param array $categories Category entries; each needs "id", "title" and "slug", and may
     *                          carry a "children" list of entries with the same keys.
     * @param bool  $dryRun     When true, skip all database writes.
     *
     * @return array<int, int|null> Local category id keyed by external id.
     */
    private function syncCategories(array $categories, bool $dryRun): array
    {
        $map = [];

        foreach ($categories as $category) {
            if (! isset($category['id'], $category['title'], $category['slug'])) {
                continue;
            }

            $externalId = (int) $category['id'];
            $parentId = null;

            if (! $dryRun) {
                $model = Category::query()->updateOrCreate(
                    ['external_id' => $externalId],
                    ['name' => (string) $category['title'], 'slug' => (string) $category['slug'], 'parent_id' => null]
                );
                $parentId = $model->id;
            }

            $map[$externalId] = $parentId;

            // Scraped data: tolerate a "children" entry that is present but not a list.
            $children = $category['children'] ?? [];
            if (! is_array($children)) {
                $children = [];
            }

            foreach ($children as $child) {
                if (! isset($child['id'], $child['title'], $child['slug'])) {
                    continue;
                }

                $childExternalId = (int) $child['id'];

                // Children are only persisted when their parent row exists.
                if ($dryRun || $parentId === null) {
                    $map[$childExternalId] = null;
                    continue;
                }

                $childModel = Category::query()->updateOrCreate(
                    ['external_id' => $childExternalId],
                    ['name' => (string) $child['title'], 'slug' => (string) $child['slug'], 'parent_id' => $parentId]
                );
                $map[$childExternalId] = $childModel->id;
            }
        }

        return $map;
    }

    /**
     * Build the list of category and subcategory listing pages to crawl.
     *
     * @param string $baseUrl    Helpdesk root URL; a trailing slash is ignored.
     * @param array  $categories Category entries as returned by extractCategories().
     *
     * @return array<int, array{url: string, category_external_id: int, subcategory_external_id: int|null}>
     */
    private function buildSections(string $baseUrl, array $categories): array
    {
        // Avoid "//" in generated URLs when the caller passes a base URL ending in "/".
        $root = rtrim($baseUrl, '/');
        $sections = [];

        foreach ($categories as $category) {
            if (! isset($category['id'], $category['slug'])) {
                continue;
            }

            $categoryExternalId = (int) $category['id'];

            $sections[] = [
                'url' => sprintf('%s/%d/%s', $root, $categoryExternalId, (string) $category['slug']),
                'category_external_id' => $categoryExternalId,
                'subcategory_external_id' => null,
            ];

            // Scraped data: tolerate a "children" entry that is present but not a list.
            $children = $category['children'] ?? [];
            if (! is_array($children)) {
                $children = [];
            }

            foreach ($children as $child) {
                if (! isset($child['id'], $child['slug'])) {
                    continue;
                }

                $sections[] = [
                    'url' => sprintf('%s/%d/%s', $root, (int) $child['id'], (string) $child['slug']),
                    'category_external_id' => $categoryExternalId,
                    'subcategory_external_id' => (int) $child['id'],
                ];
            }
        }

        return $sections;
    }

    /**
     * Collect article URLs from the root page and every section page.
     *
     * The first source that mentions a URL decides its category metadata; the root page is
     * scanned first and carries no category information.
     *
     * @param string $baseUrl  Helpdesk root URL.
     * @param string $rootHtml Already-downloaded HTML of the root page (not fetched again).
     * @param array  $sections Section descriptors as returned by buildSections().
     *
     * @return array<string, array{category_external_id: int|null, subcategory_external_id: int|null}>
     *         Metadata keyed by lower-cased article URL.
     */
    private function collectArticleUrls(string $baseUrl, string $rootHtml, array $sections): array
    {
        $result = [];
        $sources = array_merge([
            ['url' => $baseUrl, 'category_external_id' => null, 'subcategory_external_id' => null, 'html' => $rootHtml],
        ], $sections);

        foreach ($sources as $source) {
            try {
                $html = $source['html'] ?? $this->fetch($source['url']);
            } catch (\Throwable) {
                // Best-effort crawl: a section that cannot be downloaded is skipped.
                continue;
            }

            preg_match_all('/https:\/\/www\.internettoday\.nl\/helpdesk\/(\d+)-[a-z0-9\-]+/i', $html, $matches);

            foreach (($matches[0] ?? []) as $match) {
                $url = strtolower($match);

                if (isset($result[$url])) {
                    continue;
                }

                $result[$url] = [
                    'category_external_id' => $source['category_external_id'],
                    'subcategory_external_id' => $source['subcategory_external_id'],
                ];
            }
        }

        return $result;
    }

    /**
     * Download one article page and extract its title, plain-text content and numeric id.
     *
     * @param string $url Article URL; must contain "/helpdesk/<digits>-".
     *
     * @return array{0: string, 1: string, 2: int}|null [title, content, source article id],
     *         or null when the URL has no id, the download fails, or title/content are
     *         missing or empty.
     */
    private function parseArticlePage(string $url): ?array
    {
        // Cheap check first: without a numeric id in the URL the page can never be imported,
        // so there is no point in downloading it.
        if (! preg_match('/\/helpdesk\/(\d+)-/', $url, $idMatch)) {
            return null;
        }

        try {
            $html = $this->fetch($url);
        } catch (\Throwable) {
            // Best-effort import: an unreachable page is reported to the caller as null.
            return null;
        }

        // Anchor on the opening heading tag (attributes allowed) so only the heading text is captured.
        if (! preg_match('/<h1[^>]*>(.*?)<\/h1>/is', $html, $titleMatch)) {
            return null;
        }

        $title = $this->sanitizeText($titleMatch[1]);
        if ($title === '') {
            return null;
        }

        // NOTE(review): the capture group in this pattern is empty, so $contentMatch[1] is always ''
        // and the empty-content check below returns null for every page. The markup this pattern
        // is meant to anchor on cannot be recovered from this file; restore it before relying on
        // the importer.
        if (! preg_match('/\s*()\s*<\/div>/is', $html, $contentMatch)) {
            return null;
        }

        // Turn line breaks and paragraph boundaries into whitespace before tags are stripped, so
        // adjacent text is not glued together.
        // NOTE(review): sanitizeText() collapses every whitespace run to a single space, so these
        // newlines do not survive as newlines in the stored content. Confirm that is intended.
        $contentRaw = $contentMatch[1];
        $contentRaw = preg_replace('/<\s*br\s*\/?\s*>/i', "\n", $contentRaw) ?? $contentRaw;
        $contentRaw = preg_replace('/<\/p>\s*<p[^>]*>/i', "\n\n", $contentRaw) ?? $contentRaw;

        $content = $this->sanitizeText($contentRaw);
        if ($content === '') {
            return null;
        }

        return [$title, Str::limit($content, 64000, ''), (int) $idMatch[1]];
    }

    /**
     * Translate an external category id into a local one.
     *
     * Looks in the supplied map first and falls back to a database lookup when the map has no
     * non-null entry for the id.
     *
     * @param int|null             $externalId External category id, or null for "no category".
     * @param array<int, int|null> $map        Map as returned by syncCategories().
     *
     * @return int|null Local category id, or null when unknown.
     */
    private function resolveCategoryId(?int $externalId, array $map): ?int
    {
        if ($externalId === null) {
            return null;
        }

        $id = $map[$externalId] ?? Category::query()->where('external_id', $externalId)->value('id');

        // The query may hand back a numeric string depending on the driver; normalise to ?int.
        return $id === null ? null : (int) $id;
    }

    /**
     * Reduce an HTML fragment to single-line plain text.
     *
     * Tags are stripped, entities decoded, whitespace runs collapsed to one space and the
     * result trimmed.
     *
     * @param string $value HTML fragment.
     *
     * @return string Plain text; may be empty.
     */
    private function sanitizeText(string $value): string
    {
        $decoded = html_entity_decode(strip_tags($value), ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // Prefer the Unicode-aware pattern so whitespace such as the no-break space produced by
        // decoding "&nbsp;" is collapsed too; it yields null on invalid UTF-8, in which case the
        // byte-wise pattern is used instead.
        $collapsed = preg_replace('/\s+/u', ' ', $decoded)
            ?? preg_replace('/\s+/', ' ', $decoded)
            ?? $decoded;

        return trim($collapsed);
    }
}