Skip to content

Commit d0fb383

Browse files
feat(seo): add page scraping and content analysis tools
1 parent 7f2c882 commit d0fb383

File tree

19 files changed

+793
-0
lines changed

19 files changed

+793
-0
lines changed

seo-tool/.gitignore

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
2+
3+
# dependencies
4+
/node_modules
5+
/.pnp
6+
.pnp.*
7+
.yarn/*
8+
!.yarn/patches
9+
!.yarn/plugins
10+
!.yarn/releases
11+
!.yarn/versions
12+
13+
# testing
14+
/coverage
15+
16+
# next.js
17+
/.next/
18+
/out/
19+
20+
# production
21+
/build
22+
23+
# misc
24+
.DS_Store
25+
*.pem
26+
27+
# debug
28+
npm-debug.log*
29+
yarn-debug.log*
30+
yarn-error.log*
31+
.pnpm-debug.log*
32+
33+
# env files (can opt-in for committing if needed)
34+
.env*
35+
36+
# vercel
37+
.vercel
38+
39+
# typescript
40+
*.tsbuildinfo
41+
next-env.d.ts

seo-tool/README.md

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
This is a [Next.js](https://nextjs.org) project bootstrapped with [`create-next-app`](https://nextjs.org/docs/app/api-reference/cli/create-next-app).
2+
3+
## Getting Started
4+
5+
First, run the development server:
6+
7+
```bash
8+
npm run dev
9+
# or
10+
yarn dev
11+
# or
12+
pnpm dev
13+
# or
14+
bun dev
15+
```
16+
17+
Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.
18+
19+
You can start editing the page by modifying `app/page.tsx`. The page auto-updates as you edit the file.
20+
21+
This project uses [`next/font`](https://nextjs.org/docs/app/building-your-application/optimizing/fonts) to automatically optimize and load [Geist](https://vercel.com/font), a new font family for Vercel.
22+
23+
## Learn More
24+
25+
To learn more about Next.js, take a look at the following resources:
26+
27+
- [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API.
28+
- [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial.
29+
30+
You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js) - your feedback and contributions are welcome!
31+
32+
## Deploy on Vercel
33+
34+
The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js.
35+
36+
Check out our [Next.js deployment documentation](https://nextjs.org/docs/app/building-your-application/deploying) for more details.
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
import { NextRequest, NextResponse } from 'next/server';
2+
import axios from 'axios';
3+
import * as cheerio from 'cheerio';
4+
import { removeStopwords } from 'stopword';
5+
6+
/**
 * Ranks the most frequent non-stopword terms found in `words`.
 *
 * Each word is lower-cased and stripped of everything that is not a Unicode
 * letter or digit, so non-ASCII words ("über", "café") stay intact instead of
 * being truncated. Terms of two characters or fewer are discarded before the
 * stopword filter runs.
 *
 * @param words - Whitespace-separated tokens of the page body.
 * @param totalWords - Denominator for the density percentage.
 * @param limit - Maximum number of keywords to return.
 * @returns Keywords sorted by descending count, with density formatted as "x.xx%".
 */
function extractTopKeywords(
  words: readonly string[],
  totalWords: number,
  limit: number
): Array<{ word: string; count: number; density: string }> {
  const normalized = words
    .map(word => word.toLowerCase().replace(/[^\p{L}\p{N}]/gu, ''))
    .filter(word => word.length > 2);

  // A Map (not a plain object) so that words such as "constructor" cannot
  // collide with properties inherited from Object.prototype.
  const frequency = new Map<string, number>();
  for (const word of removeStopwords(normalized)) {
    frequency.set(word, (frequency.get(word) ?? 0) + 1);
  }

  return Array.from(frequency.entries())
    .sort((a, b) => b[1] - a[1])
    .slice(0, limit)
    .map(([word, count]) => ({
      word,
      count,
      density: ((count / totalWords) * 100).toFixed(2) + '%',
    }));
}

/**
 * Splits link targets into internal (same hostname as `base`) and external.
 *
 * Empty hrefs, hrefs that cannot be resolved against `base`, and non-web
 * schemes (mailto:, tel:, javascript:, ...) are counted in neither bucket.
 *
 * @param hrefs - Raw `href` attribute values.
 * @param base - URL of the analysed page, used to resolve relative hrefs.
 */
function classifyLinks(
  hrefs: readonly string[],
  base: URL
): { internal: number; external: number } {
  let internal = 0;
  let external = 0;

  for (const href of hrefs) {
    if (!href) continue;

    let target: URL;
    try {
      target = new URL(href, base);
    } catch {
      // Unparseable href: not classifiable, skip it.
      continue;
    }

    if (target.protocol !== 'http:' && target.protocol !== 'https:') continue;

    if (target.hostname === base.hostname) {
      internal++;
    } else {
      external++;
    }
  }

  return { internal, external };
}

/**
 * POST handler: fetches the page at the `url` given in the JSON body and
 * returns a content report (heading outline, text statistics, top keywords,
 * image alt-text coverage, link counts and SEO recommendations).
 *
 * Responses:
 * - 400 when the body is not valid JSON, `url` is missing, is not a parseable
 *   URL, or does not use http/https.
 * - 422 when the fetched resource is not a text document.
 * - 500 for any other failure (network error, timeout, ...).
 *
 * @param request - Incoming request whose JSON body is `{ url: string }`.
 */
export async function POST(request: NextRequest) {
  try {
    // A malformed body is a client error, not a server failure.
    let payload: unknown;
    try {
      payload = await request.json();
    } catch {
      return NextResponse.json(
        { error: 'Invalid JSON body' },
        { status: 400 }
      );
    }

    const url =
      typeof payload === 'object' && payload !== null
        ? (payload as { url?: unknown }).url
        : undefined;

    if (!url) {
      return NextResponse.json(
        { error: 'URL is required' },
        { status: 400 }
      );
    }

    // Validate URL format
    let pageUrl: URL;
    try {
      if (typeof url !== 'string') {
        throw new TypeError('url must be a string');
      }
      pageUrl = new URL(url);
    } catch {
      return NextResponse.json(
        { error: 'Invalid URL format' },
        { status: 400 }
      );
    }

    // Only web pages can be analysed; reject file:, ftp:, data:, etc.
    // NOTE(security): this does not stop requests to private/internal hosts;
    // add a host allow/deny list if this endpoint is exposed to untrusted users.
    if (pageUrl.protocol !== 'http:' && pageUrl.protocol !== 'https:') {
      return NextResponse.json(
        { error: 'Only http and https URLs are supported' },
        { status: 400 }
      );
    }

    // Fetch the webpage. responseType 'text' keeps axios from parsing a JSON
    // response into an object before it reaches cheerio.
    const response = await axios.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
      },
      timeout: 10000,
      responseType: 'text',
    });

    const html: unknown = response.data;
    if (typeof html !== 'string') {
      return NextResponse.json(
        { error: 'URL did not return a text document' },
        { status: 422 }
      );
    }

    const $ = cheerio.load(html);

    // Analyze heading structure (document order, empty headings skipped)
    const headingStructure: Array<{ level: string; text: string; order: number }> = [];
    let order = 0;

    $('h1, h2, h3, h4, h5, h6').each((_, el) => {
      const text = $(el).text().trim();
      if (text) {
        headingStructure.push({
          level: el.tagName.toLowerCase(),
          text,
          order: order++,
        });
      }
    });

    // Count headings by level
    const headingCounts = {
      h1: $('h1').length,
      h2: $('h2').length,
      h3: $('h3').length,
      h4: $('h4').length,
      h5: $('h5').length,
      h6: $('h6').length,
    };

    // Analyze images and links on the complete document. This must happen
    // before the page chrome is stripped below, otherwise navigation links
    // and header/footer images would be missing from the counts.
    const totalImages = $('img').length;
    const imagesWithAlt = $('img[alt]')
      .filter((_, el) => Boolean($(el).attr('alt')?.trim()))
      .length;
    const imagesWithoutAlt = totalImages - imagesWithAlt;

    const anchors = $('a[href]');
    const totalLinks = anchors.length;
    const hrefs = anchors.toArray().map(el => $(el).attr('href') ?? '');
    const { internal: internalLinks, external: externalLinks } = classifyLinks(hrefs, pageUrl);

    // Extract and analyze body text (scripts, styles and page chrome removed)
    $('script, style, nav, footer, header, aside').remove();
    const bodyText = $('body').text().replace(/\s+/g, ' ').trim();

    // Word count
    const words = bodyText.split(/\s+/).filter(word => word.length > 0);
    const wordCount = words.length;

    // Character count
    const characterCount = bodyText.length;
    const characterCountNoSpaces = bodyText.replace(/\s/g, '').length;

    // Sentence count (approximate: split on runs of ., ! or ?)
    const sentenceCount = bodyText
      .split(/[.!?]+/)
      .filter(sentence => sentence.trim().length > 0).length;

    // Average words per sentence
    const avgWordsPerSentence = sentenceCount > 0 ? Math.round(wordCount / sentenceCount) : 0;

    // Top keywords (most common words, excluding stopwords)
    const topKeywords = extractTopKeywords(words, wordCount, 20);

    // Analyze paragraphs (main content only, page chrome already removed)
    const paragraphs = $('p').length;
    const avgWordsPerParagraph = paragraphs > 0 ? Math.round(wordCount / paragraphs) : 0;

    // Reading time estimate (average 200 words per minute)
    const readingTimeMinutes = Math.ceil(wordCount / 200);

    // SEO recommendations
    const recommendations: string[] = [];
    if (headingCounts.h1 === 0) {
      recommendations.push('Add an H1 heading to your page');
    } else if (headingCounts.h1 > 1) {
      recommendations.push('Consider using only one H1 heading per page');
    }
    if (wordCount < 300) {
      recommendations.push('Content is quite short. Consider adding more content (aim for 300+ words)');
    }
    if (imagesWithoutAlt > 0) {
      recommendations.push(`${imagesWithoutAlt} image(s) missing alt text. Add alt text for better SEO and accessibility`);
    }
    if (topKeywords.length === 0) {
      recommendations.push('No significant keywords found. Add more relevant content');
    }

    return NextResponse.json({
      url,
      headingStructure,
      headingCounts,
      content: {
        wordCount,
        characterCount,
        characterCountNoSpaces,
        sentenceCount,
        paragraphCount: paragraphs,
        avgWordsPerSentence,
        avgWordsPerParagraph,
        readingTimeMinutes,
      },
      keywords: topKeywords,
      images: {
        total: totalImages,
        withAlt: imagesWithAlt,
        withoutAlt: imagesWithoutAlt,
      },
      links: {
        total: totalLinks,
        internal: internalLinks,
        external: externalLinks,
      },
      recommendations,
    });
  } catch (error: unknown) {
    console.error('Content analysis error:', error);
    const message = error instanceof Error ? error.message : '';
    return NextResponse.json(
      { error: message || 'Failed to analyze content' },
      { status: 500 }
    );
  }
}

0 commit comments

Comments
 (0)