|
12 | 12 | import mimetypes |
13 | 13 | from urllib.parse import urlparse |
14 | 14 |
|
| 15 | + |
def get_extension_for_mimetype(mimetype):
    """Get the most common file extension for a given MIME type.

    Parameters following the first ``;`` (for example ``; charset=utf-8``)
    are ignored and the type is matched case-insensitively, so
    ``"application/json; charset=utf-8"`` resolves the same way as
    ``"application/json"``.

    Args:
        mimetype: MIME type string, optionally carrying parameters. ``None``
            or an empty string is treated as an unknown type.

    Returns:
        The extension including its leading dot, or ``".bin"`` when the type
        is not recognised by ``mimetypes`` or by the fallback table.
    """
    if not mimetype:
        return ".bin"

    # Strip parameters such as "; charset=utf-8": neither the stdlib lookup
    # nor the fallback table below understands them.
    base_type = mimetype.split(";", 1)[0].strip().lower()

    ext = mimetypes.guess_extension(base_type)
    if ext:
        return ext

    # Fallback mappings for common types
    fallbacks = {
        "application/json": ".json",
        "image/svg+xml": ".svg",
        "text/html": ".html",
        "text/css": ".css",
        "application/javascript": ".js",
    }
    return fallbacks.get(base_type, ".bin")
| 31 | + |
30 | 32 |
|
def extract_path_from_url(url):
    """Convert a URL into a relative filesystem path, preserving the path structure.

    Only the path component of the URL is used; scheme, host, query string
    and fragment are discarded. Empty segments and ``.`` segments are
    dropped, and ``..`` segments are resolved without ever climbing above
    the root, so the result cannot point outside the directory it is later
    joined to.

    Args:
        url: The URL to convert.

    Returns:
        A ``/``-separated relative path with no leading or trailing slash,
        or ``"index"`` when the URL has no usable path.
    """
    parsed = urlparse(url)

    segments = []
    for segment in parsed.path.split("/"):
        # Leading, trailing and doubled slashes show up as empty segments
        if segment in ("", "."):
            continue
        if segment == "..":
            # Step back one level, but never past the root of the path
            if segments:
                segments.pop()
            continue
        segments.append(segment)

    # Handle empty paths
    if not segments:
        return "index"

    return "/".join(segments)
44 | 46 |
|
| 47 | + |
def _is_within_directory(base_dir, target):
    """Return True when ``target`` resolves to a location strictly inside ``base_dir``."""
    base = base_dir.resolve()
    resolved = target.resolve()
    if resolved == base:
        return False
    try:
        resolved.relative_to(base)
    except ValueError:
        return False
    return True


@click.command()
@click.argument("harzip", type=click.Path(exists=True))
@click.argument("mimetypes", nargs=-1, required=True)
@click.option(
    "-o",
    "--output",
    type=click.Path(),
    default=".",
    help="Output directory for extracted files",
)
@click.option(
    "--paths",
    is_flag=True,
    help="Use URL paths for filenames instead of original names",
)
@click.option(
    "--pretty-json",
    is_flag=True,
    help="Pretty print JSON files with 2-space indentation",
)
def extract_har(harzip, mimetypes, output, paths, pretty_json):
    """Extract files of specified MIME types from a HAR archive.
    \f
    The archive is a zip file containing ``har.har`` plus the response
    bodies that its entries reference through ``response.content._file``.

    Args:
        harzip: Path to the zip archive.
        mimetypes: MIME types to extract. An entry matches when its
            ``mimeType`` equals one of these exactly, or when its type with
            parameters (``; charset=...``) removed matches case-insensitively.
        output: Directory that receives the extracted files; created if
            missing.
        paths: When true, name each file after the request URL path instead
            of the name stored in the archive.
        pretty_json: When true, re-indent ``application/json`` bodies with
            two spaces; bodies that fail to parse are written unchanged.

    Entries whose target path would fall outside ``output`` are skipped with
    a warning rather than written.
    """
    # NOTE: the ``mimetypes`` parameter is named by the click argument and
    # shadows the stdlib module of the same name inside this function.
    output_dir = Path(output)
    output_dir.mkdir(parents=True, exist_ok=True)

    wanted_types = {wanted.strip().lower() for wanted in mimetypes}

    with zipfile.ZipFile(harzip) as zf:
        # Read the HAR JSON file
        try:
            har_content = json.loads(zf.read("har.har"))
        except KeyError:
            click.echo("Error: har.har not found in archive", err=True)
            return
        except (json.JSONDecodeError, UnicodeDecodeError):
            # json.loads on bytes raises UnicodeDecodeError for undecodable input
            click.echo("Error: Invalid JSON in har.har", err=True)
            return

        # Process each entry
        for entry in har_content.get("log", {}).get("entries", []):
            response = entry.get("response", {})
            content = response.get("content", {})

            # Check if this entry matches our MIME type filter. The recorded
            # type may carry parameters ("application/json; charset=utf-8"),
            # so compare the bare type as well as the raw string.
            raw_type = content.get("mimeType") or ""
            base_type = raw_type.split(";", 1)[0].strip().lower()
            if raw_type not in mimetypes and base_type not in wanted_types:
                continue

            # Get the file reference and URL
            file_ref = content.get("_file")
            if not file_ref:
                continue

            request_url = entry.get("request", {}).get("url", "")

            # Extract the file
            try:
                file_content = zf.read(file_ref)
            except KeyError:
                click.echo(f"Warning: File {file_ref} not found in archive", err=True)
                continue

            if paths:
                # Use URL path for filename
                path = extract_path_from_url(request_url)
                # Add appropriate extension if not present
                if not Path(path).suffix:
                    path += get_extension_for_mimetype(base_type)
                outpath = output_dir / path
            else:
                # Use original filename
                outpath = output_dir / file_ref

            # Both the URL and the archive member name come from the HAR
            # file; refuse anything ("../x", absolute paths) that would be
            # written outside the output directory.
            if not _is_within_directory(output_dir, outpath):
                click.echo(
                    f"Warning: Skipping {file_ref} - target path is outside the output directory",
                    err=True,
                )
                continue

            # Ensure parent directories exist
            outpath.parent.mkdir(parents=True, exist_ok=True)

            # Handle JSON pretty printing if requested
            if pretty_json and base_type == "application/json":
                try:
                    json_data = json.loads(file_content)
                    file_content = json.dumps(json_data, indent=2).encode("utf-8")
                except (json.JSONDecodeError, UnicodeDecodeError):
                    click.echo(
                        f"Warning: Could not pretty print {outpath} - invalid JSON",
                        err=True,
                    )

            # Write the file
            outpath.write_bytes(file_content)
            click.echo(f"Extracted: {outpath}")
119 | 136 |
|
# Run the CLI only when this file is executed as a script, not when imported
if __name__ == "__main__":
    extract_har()
0 commit comments