|
 import sys
 import requests
-import json
-import openai
+import os
 
 
-def scrape_url(url):
+def is_valid_markdown(content):
+    # Basic check for valid Markdown (customize as needed)
+    return bool(content.strip()) and content != 'Not Found'
+
+
+def fetch_readme(url):
+    # Convert GitHub blob URL to raw URL and append /README.md
+    if 'blob' in url:
+        raw_url = url.replace('blob', 'raw') + '/README.md'
+    else:
+        raw_url = url + '/raw/main/README.md'
+
     try:
-        response = requests.get(url, timeout=10)
+        # Try fetching from the raw URL
+        print(f"Fetching README from: {raw_url}")
+        response = requests.get(raw_url, timeout=10)
         response.raise_for_status()
-        content = response.json()
-
-        # for testing
-        with open('scraped_content.txt', 'w', encoding='utf-8') as f:
-            f.write(content)
-        print(f"Scraped content from {url}")
-    except Exception as e:
-        print(f"Error scraping {url}: {e}")
+
+        # Check if content is valid Markdown
+        if not is_valid_markdown(response.text):
+            # Fallback to /master/ if /main/ fails
+            raw_url = raw_url.replace('/main/', '/master/')
+            print(f"Retrying with: {raw_url}")
+            response = requests.get(raw_url, timeout=10)
+            response.raise_for_status()
+
+            if not is_valid_markdown(response.text):
+                print(
+                    f"Error: Fetched content from {raw_url} is not valid Markdown")
+                sys.exit(1)
+
+        # Ensure the output directory exists
+        # Change to desired folder (e.g., 'downloads' or '')
+        output_dir = 'local'
+        if output_dir and not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+
+        # Save the README content in the local folder
+        output_file = os.path.join(
+            output_dir, 'readme.md') if output_dir else 'readme.md'
+        with open(output_file, 'w', encoding='utf-8') as f:
+            f.write(response.text)
+        print(f"Saved README to {output_file}")
+
+    except requests.RequestException as e:
+        print(f"Error fetching README from {raw_url}: {e}")
         sys.exit(1)
 
 
 if __name__ == "__main__":
-    issue_body = sys.argv[1]
-    # Extract URL from issue body (assumes link is on its own line)
-    # TODO: need a more robust way to extract the URL
-    for line in issue_body.split('\n'):
-        line = line.strip()
-        if line.startswith('http://') or line.startswith('https://'):
-            scrape_url(line)
-            break
-    else:
-        print("No valid URL found in issue body")
+    if len(sys.argv) < 2:
+        print("No URL provided")
         sys.exit(1)
+
+    url = sys.argv[1].strip()
+    # Basic URL validation
+    if not (url.startswith('http://') or url.startswith('https://')):
+        print("Invalid URL provided")
+        sys.exit(1)
+
+    fetch_readme(url)
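
For context, a minimal sketch of how the updated entry point is meant to be invoked; the filename fetch_readme.py and the repository URL below are assumptions for illustration, not part of this commit.

# Hypothetical driver; the script name and example URL are assumptions.
import subprocess
import sys

repo_url = "https://github.com/octocat/Hello-World"  # placeholder repository URL
result = subprocess.run(
    [sys.executable, "fetch_readme.py", repo_url],  # assumed filename
    check=False,
)
# On success the script writes local/readme.md; a non-zero return code means
# the README could not be fetched or was not valid Markdown.
print(f"exit code: {result.returncode}")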