 import { Octokit } from "@octokit/rest";
 import cheerio from "cheerio";
 import fetch from "node-fetch";
-import { URL } from "url";
+import { URL } from "node:url";

 const IGNORE_PATHS_ENDING_IN = [
   "favicon.ico",
@@ -18,24 +18,33 @@ const IGNORE_PATHS_ENDING_IN = [

 const GITHUB_PATHS_TO_TRAVERSE = ["/blob/", "/tree/"];

+async function getDefaultBranch(owner: string, repo: string): Promise<string> {
+  const octokit = new Octokit({ auth: undefined });
+
+  const repoInfo = await octokit.repos.get({
+    owner,
+    repo,
+  });
+
+  return repoInfo.data.default_branch;
+}
+
 async function crawlGithubRepo(baseUrl: URL) {
   const octokit = new Octokit({
     auth: undefined,
   });

   const [_, owner, repo] = baseUrl.pathname.split("/");

-  let dirContentsConfig = {
-    owner: owner,
-    repo: repo,
-  };
+  const branch = await getDefaultBranch(owner, repo);
+  console.log("Github repo detected. Crawling", branch, "branch");

   const tree = await octokit.request(
     "GET /repos/{owner}/{repo}/git/trees/{tree_sha}",
     {
       owner,
       repo,
-      tree_sha: "main",
+      tree_sha: branch,
       headers: {
         "X-GitHub-Api-Version": "2022-11-28",
       },
@@ -44,8 +53,8 @@ async function crawlGithubRepo(baseUrl: URL) {
   );

   const paths = tree.data.tree
-    .filter((file) => file.type === "blob" && file.path?.endsWith(".md"))
-    .map((file) => baseUrl.pathname + "/tree/main/" + file.path);
+    .filter((file: any) => file.type === "blob" && file.path?.endsWith(".md"))
+    .map((file: any) => baseUrl.pathname + "/tree/" + branch + "/" + file.path);

   return paths;
 }
@@ -54,6 +63,7 @@ async function getLinksFromUrl(url: string, path: string) {
   const baseUrl = new URL(url);
   const location = new URL(path, url);
   let response;
+
   try {
     response = await fetch(location.toString());
   } catch (error: unknown) {
@@ -63,13 +73,12 @@ async function getLinksFromUrl(url: string, path: string) {
         html: "",
         links: [],
       };
-    } else {
-      console.error(error);
-      return {
-        html: "",
-        links: [],
-      };
     }
+    console.error(error);
+    return {
+      html: "",
+      links: [],
+    };
   }

   const html = await response.text();
@@ -113,7 +122,9 @@ async function getLinksFromUrl(url: string, path: string) {
 }

 function splitUrl(url: URL) {
-  const baseUrl = `${url.protocol}//${url.hostname}`;
+  const baseUrl = `${url.protocol}//${url.hostname}${
+    url.port ? ":" + url.port : ""
+  }`;
   const basePath = url.pathname;
   return {
     baseUrl,
@@ -127,46 +138,69 @@ export type PageData = {
   html: string;
 };

-export async function* crawlPage(url: URL): AsyncGenerator<PageData> {
+export async function* crawlPage(
+  url: URL,
+  maxDepth: number = 3,
+): AsyncGenerator<PageData> {
+  console.log("Starting crawl from: ", url, " - Max Depth: ", maxDepth);
   const { baseUrl, basePath } = splitUrl(url);
-  let paths: string[] = [basePath];
+  let paths: { path: string; depth: number }[] = [{ path: basePath, depth: 0 }];

   if (url.hostname === "github.com") {
     const githubLinks = await crawlGithubRepo(url);
-    paths = [...paths, ...githubLinks];
+    const githubLinkObjects = githubLinks.map((link) => ({
+      path: link,
+      depth: 0,
+    }));
+    paths = [...paths, ...githubLinkObjects];
   }

   let index = 0;
-
   while (index < paths.length) {
-    const promises = paths
-      .slice(index, index + 50)
-      .map((path) => getLinksFromUrl(baseUrl, path));
-
-    const results = await Promise.all(promises);
-
-    for (const { html, links } of results) {
-      if (html !== "") {
-        yield {
-          url: url.toString(),
-          path: paths[index],
-          html: html,
-        };
-      }
+    const batch = paths.slice(index, index + 50);
+
+    try {
+      const promises = batch.map(({ path, depth }) =>
+        getLinksFromUrl(baseUrl, path).then((links) => ({
+          links,
+          path,
+          depth,
+        })),
+      ); // Keep path and depth alongside each fetched result
+
+      const results = await Promise.all(promises);
+      for (const {
+        links: { html, links: linksArray },
+        path,
+        depth,
+      } of results) {
+        if (html !== "" && depth <= maxDepth) {
+          // Only yield pages within the depth limit
+          yield {
+            url: url.toString(),
+            path,
+            html,
+          };
+        }

-      for (let link of links) {
-        if (!paths.includes(link)) {
-          paths.push(link);
+        // Ensure we only add links if within depth limit
+        if (depth < maxDepth) {
+          for (let link of linksArray) {
+            if (!paths.some((p) => p.path === link)) {
+              paths.push({ path: link, depth: depth + 1 }); // Increment depth for new paths
+            }
+          }
         }
       }
-
-      index++;
+    } catch (e) {
+      if (e instanceof TypeError) {
+        console.warn("Error while crawling page: ", e); // Likely an invalid URL; continue crawling
+      } else {
+        console.error("Error while crawling page: ", e);
+      }
     }

-    paths = paths.filter((path) =>
-      results.some(
-        (result) => result.html !== "" && result.links.includes(path),
-      ),
-    );
+    index += batch.length; // Proceed to next batch
   }
-}
+  console.log("Crawl completed");
+}
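
For context, a minimal sketch of how the updated crawlPage generator could be consumed once this change lands. The import path and the target URL are illustrative assumptions, not part of this PR:

import { crawlPage } from "./crawl"; // assumed path to the module changed above

async function main() {
  // maxDepth defaults to 3; pass a smaller value to stop following links sooner.
  const root = new URL("https://docs.example.com/intro"); // illustrative URL
  for await (const page of crawlPage(root, 2)) {
    console.log(`Crawled ${page.path} (${page.html.length} bytes of HTML)`);
  }
}

main().catch(console.error);

Each yielded PageData arrives as soon as its batch of up to 50 fetches resolves, so callers can start indexing pages before the crawl finishes.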