Skip to content

Commit d713d1c

Browse files
authored
Merge pull request #36 from Umar-Azam/main
Added option for simple containerized execution
2 parents ae253fd + 84d19e3 commit d713d1c

File tree

6 files changed

+110
-0
lines changed

6 files changed

+110
-0
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ Use this option for API access to your generated knowledge that you can integrat
116116

117117
![Gif of how to upload to an assistant](https://github.com/BuilderIO/gpt-crawler/assets/844291/06e6ad36-e2ba-4c6e-8d5a-bf329140de49)
118118

119+
## (Alternate method) Running in a container with Docker
120+
To obtain the `output.json` via a containerized execution, go into the `containerapp` directory and modify `config.ts` as described above; the `output.json` file will be generated in the `data` folder. Note: the `outputFileName` property in the `config.ts` file in the `containerapp` folder is configured to work with the container.
119121

120122

121123
## Contributing

containerapp/Dockerfile

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
# Image for running gpt-crawler in a container (driven by run.sh / init.sh).
FROM ubuntu:jammy

# Base tooling: git is needed to clone gpt-crawler; ca-certificates/curl/gnupg
# are needed for the apt repositories added below. sudo is kept so the image
# contents stay backward-compatible, but the build itself runs as root and
# never needs it. Apt lists are removed to keep layers small.
RUN apt-get update && \
    apt-get install -y sudo git ca-certificates curl gnupg && \
    rm -rf /var/lib/apt/lists/*

# Install Docker (the container talks to the host daemon through the
# /var/run/docker.sock bind mount set up by run.sh).
# Fix: /etc/apt/keyrings does not exist on ubuntu:jammy — create it before
# writing the Docker signing key, otherwise gpg fails with "No such file".
RUN install -m 0755 -d /etc/apt/keyrings && \
    curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \
    echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null && \
    apt-get update && \
    apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin && \
    rm -rf /var/lib/apt/lists/*

# Install Node.js v20 (ships npm) from the NodeSource repository.
RUN curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \
    echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list && \
    apt-get update && \
    apt-get install -y nodejs && \
    rm -rf /var/lib/apt/lists/*

# Clone gpt-crawler, install its npm dependencies, and install the Playwright
# browsers plus their system dependencies.
RUN cd /home && git clone https://github.com/builderio/gpt-crawler && cd gpt-crawler && \
    npm i && \
    npx playwright install && \
    npx playwright install-deps

# Directory that run.sh bind-mounts for input (config.ts) and output.json.
RUN mkdir /home/data

WORKDIR /home

containerapp/README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Containerized crawler
2+
## Docker image with the packaged crawler, including a script for building and execution.
3+
4+
5+
All dependencies are set up and configured in the Dockerfile. Requires Docker to be installed.
6+
7+
8+
## Get started
9+
10+
### Prerequisites
11+
12+
Be sure you have Docker installed.
13+
14+
1. ``` cd gpt-crawler/containerapp ```
15+
2. ``` . ./run.sh ```

containerapp/data/config.ts

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import { Page } from "playwright";
2+
3+
type Config = {
4+
/** URL to start the crawl */
5+
url: string;
6+
/** Pattern to match against for links on a page to subsequently crawl */
7+
match: string;
8+
/** Selector to grab the inner text from */
9+
selector: string;
10+
/** Don't crawl more than this many pages */
11+
maxPagesToCrawl: number;
12+
/** File name for the finished data */
13+
outputFileName: string;
14+
/** Optional cookie to be set. E.g. for Cookie Consent */
15+
cookie?: {name: string; value: string}
16+
/** Optional function to run for each page found */
17+
onVisitPage?: (options: {
18+
page: Page;
19+
pushData: (data: any) => Promise<void>;
20+
}) => Promise<void>;
21+
/** Optional timeout for waiting for a selector to appear */
22+
waitForSelectorTimeout?: number;
23+
};
24+
25+
export const config: Config = {
26+
url: "https://www.builder.io/c/docs/developers",
27+
match: "https://www.builder.io/c/docs/**",
28+
selector: `.docs-builder-container`,
29+
maxPagesToCrawl: 50,
30+
outputFileName: "../data/output.json",
31+
};

containerapp/data/init.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Entry point executed inside the crawler container (invoked by run.sh).
# Copies the mounted config into the cloned repo, runs the crawl, reports.

# Fix: fail fast. Without this, a failed `cp` (e.g. missing mount) was
# ignored and the crawler silently ran with a stale config.
set -euo pipefail

# copy the config when starting the container
cp /home/data/config.ts /home/gpt-crawler/

# start the crawler
cd /home/gpt-crawler && npm start

# Print message after crawling and exit
echo "Crawling complete.."

containerapp/run.sh

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Build the 'crawler' Docker image if it does not exist yet, then run it with
# the local data directory mounted for input (config.ts) and output.json.
# Intended to be run from gpt-crawler/containerapp (see containerapp/README.md).

# Check if there is a Docker image named "crawler".
# Fix: `docker image inspect` is an exact existence check; the previous
# `docker images | grep -w 'crawler'` also matched any unrelated image whose
# repository or tag merely contained the word "crawler".
if ! sudo docker image inspect crawler > /dev/null 2>&1; then
  echo "Docker repository 'crawler' not found. Building the image..."
  # Build the Docker image with the name 'crawler'
  sudo docker build -t crawler .
else
  echo "Docker image already built."
fi

# Ensure that init.sh script is executable
sudo chmod +x ./data/init.sh

# Start the container: mount docker.sock to work with the docker-in-docker
# function, and mount the data directory for input/output from the container.
# Fix: bind-mount with an absolute path — a relative '-v ./data:...' source
# is rejected by older Docker versions.
sudo docker run --rm -it \
  -v /var/run/docker.sock:/var/run/docker.sock \
  -v "$(pwd)/data:/home/data" \
  crawler bash -c "/home/data/init.sh"

0 commit comments

Comments
 (0)