// API route: scrapes a URL via the Firecrawl API and returns sanitized content.
import { NextRequest, NextResponse } from 'next/server';
/**
 * Replaces typographic punctuation with plain ASCII equivalents.
 *
 * Curly, low and reversed quotes and guillemets become straight quotes, en and
 * em dashes become a hyphen, the single-character ellipsis becomes three dots,
 * and non-breaking spaces become regular spaces. Every other character is
 * returned unchanged.
 *
 * @param text - The string to normalize.
 * @returns A copy of `text` with the characters above replaced.
 */
function sanitizeQuotes(text: string): string {
  return text
    // Smart single quotes
    .replace(/[\u2018\u2019\u201A\u201B]/g, "'")
    // Smart double quotes
    .replace(/[\u201C\u201D\u201E\u201F]/g, '"')
    // Other quote-like characters
    .replace(/[\u00AB\u00BB]/g, '"') // Guillemets
    .replace(/[\u2039\u203A]/g, "'") // Single guillemets
    // Other problematic characters
    .replace(/[\u2013\u2014]/g, '-') // En dash and em dash
    .replace(/[\u2026]/g, '...') // Ellipsis
    .replace(/[\u00A0]/g, ' '); // Non-breaking space
}
/**
 * Returns `value` when it is a string, otherwise an empty string.
 * Keeps non-string payload fields from reaching `sanitizeQuotes`.
 */
function asText(value: unknown): string {
  return typeof value === 'string' ? value : '';
}

/**
 * POST handler: scrapes the URL given in the JSON request body via the
 * Firecrawl `/v1/scrape` endpoint and returns the page content with
 * typographic punctuation normalized.
 *
 * Request body: `{ "url": string }`.
 *
 * Responses:
 * - 200 `{ success: true, url, content, screenshot, structured, metadata, message }`
 * - 400 `{ success: false, error }` when the body is not valid JSON or `url`
 *   is missing, empty, or not a string
 * - 500 `{ success: false, error }` when `FIRECRAWL_API_KEY` is unset, the
 *   Firecrawl request fails, or the response carries no scraped data
 *
 * @param request - Incoming request whose JSON body carries the target URL.
 * @returns A JSON response describing the scrape result or the failure.
 */
export async function POST(request: NextRequest): Promise<NextResponse> {
  try {
    // A malformed body is a client error, so report it as 400 rather than
    // letting the parse failure fall through to the generic 500 handler.
    let body: unknown;
    try {
      body = await request.json();
    } catch {
      return NextResponse.json({
        success: false,
        error: 'Invalid JSON body'
      }, { status: 400 });
    }

    // Read `url` defensively: the body may be null, a primitive, or an array.
    const url = typeof body === 'object' && body !== null
      ? (body as { url?: unknown }).url
      : undefined;

    if (typeof url !== 'string' || url.trim() === '') {
      return NextResponse.json({
        success: false,
        error: 'URL is required'
      }, { status: 400 });
    }

    console.log('[scrape-url-enhanced] Scraping with Firecrawl:', url);

    const FIRECRAWL_API_KEY = process.env.FIRECRAWL_API_KEY;
    if (!FIRECRAWL_API_KEY) {
      throw new Error('FIRECRAWL_API_KEY environment variable is not set');
    }

    // Ask Firecrawl for markdown, HTML and a screenshot; `maxAge` allows a
    // cached result to be served when it is recent enough.
    const firecrawlResponse = await fetch('https://api.firecrawl.dev/v1/scrape', {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${FIRECRAWL_API_KEY}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        url,
        formats: ['markdown', 'html', 'screenshot'],
        waitFor: 3000,
        timeout: 30000,
        blockAds: true,
        maxAge: 3600000, // Use cached data if less than 1 hour old
        actions: [
          {
            type: 'wait',
            milliseconds: 2000
          },
          {
            type: 'screenshot',
            fullPage: false // Just visible viewport for performance
          }
        ]
      })
    });

    if (!firecrawlResponse.ok) {
      const errorText = await firecrawlResponse.text();
      throw new Error(`Firecrawl API error: ${errorText}`);
    }

    const data = await firecrawlResponse.json();

    if (!data.success || !data.data) {
      throw new Error('Failed to scrape content');
    }

    const { markdown, metadata, screenshot, actions } = data.data;

    // Get screenshot from either direct field or actions result
    const screenshotUrl = screenshot || actions?.screenshots?.[0] || null;

    // Sanitize each text field once and reuse the result below.
    const sanitizedMarkdown = sanitizeQuotes(asText(markdown));
    const title = sanitizeQuotes(asText(metadata?.title));
    const description = sanitizeQuotes(asText(metadata?.description));

    // Format content for AI
    const formattedContent = [
      `Title: ${title}`,
      `Description: ${description}`,
      `URL: ${url}`,
      '',
      'Main Content:',
      sanitizedMarkdown
    ].join('\n').trim();

    return NextResponse.json({
      success: true,
      url,
      content: formattedContent,
      screenshot: screenshotUrl,
      structured: {
        title,
        description,
        content: sanitizedMarkdown,
        url,
        screenshot: screenshotUrl
      },
      metadata: {
        // Spread the scraped metadata first so it cannot overwrite the
        // fields this handler sets itself.
        ...metadata,
        scraper: 'firecrawl-enhanced',
        timestamp: new Date().toISOString(),
        contentLength: formattedContent.length,
        cached: data.data.cached || false // Indicates if data came from cache
      },
      message: 'URL scraped successfully with Firecrawl (with caching for 500% faster performance)'
    });
  } catch (error: unknown) {
    console.error('[scrape-url-enhanced] Error:', error);
    // Anything can be thrown, so only read `.message` from real Error objects.
    const message = error instanceof Error ? error.message : String(error);
    return NextResponse.json({
      success: false,
      error: message
    }, { status: 500 });
  }
}