-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhar_extractor.py
More file actions
130 lines (105 loc) · 4.66 KB
/
har_extractor.py
File metadata and controls
130 lines (105 loc) · 4.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# -*- coding: utf-8 -*-
"""
HAR Content Extractor
A command-line tool to extract and save all files, API responses, and web assets
from a HAR (.har) archive.
"""
import json
import base64
import os
import re
import argparse
from urllib.parse import urlparse
import binascii # Required for specific exception handling
# Characters that are illegal in file names on at least one major OS
# (Windows is the strictest), plus ASCII control characters.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/*?:"<>|\x00-\x1f]')

# Most filesystems cap a single path component at 255 bytes; stay well below
# that so a de-duplication suffix still fits.
_MAX_FILENAME_LEN = 200


def _build_filename(url: str, idx: int) -> str:
    """Derive a filesystem-safe file name for the response fetched from *url*.

    The name is the last path segment (or the host when the path is empty),
    followed by ``_<query>`` when the URL has a query string. Falls back to
    ``default_filename_<idx>`` when nothing usable remains.
    """
    try:
        parsed_url = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. a broken IPv6 literal).
        return f"default_filename_{idx}"

    # Last part of the URL path (e.g. 'script.js' from '/js/script.js'),
    # or the host for URLs such as 'https://example.com/'.
    filename = os.path.basename(parsed_url.path) or parsed_url.netloc
    filename = _ILLEGAL_FILENAME_CHARS.sub('', filename)

    if parsed_url.query:
        # '?' is itself an illegal file-name character, so join with '_'
        # to keep the name and the query visibly separated.
        safe_query = _ILLEGAL_FILENAME_CHARS.sub('_', parsed_url.query)
        filename = f"{filename}_{safe_query}"

    # Long query strings would otherwise make the write fail outright.
    filename = filename[:_MAX_FILENAME_LEN]

    # Empty names and names made only of dots/spaces ('.', '..') cannot be
    # used as regular files.
    if not filename.strip('. '):
        filename = f"default_filename_{idx}"
    return filename


def _dedupe_filename(filename: str, used_names: set) -> str:
    """Return *filename*, or a numbered variant not yet present in *used_names*.

    Names are compared case-insensitively so the result is also unique on
    case-insensitive filesystems. The chosen name is recorded in *used_names*.
    """
    root, ext = os.path.splitext(filename)
    candidate = filename
    counter = 1
    while candidate.lower() in used_names:
        candidate = f"{root}_{counter}{ext}"
        counter += 1
    used_names.add(candidate.lower())
    return candidate


def main(har_filepath: str, output_dir: str) -> None:
    """
    Parses a HAR file to extract and save all HTTP response bodies.

    Each entry that carries response text is written to ``output_dir`` under a
    name derived from its request URL. Entries that resolve to the same name
    within one run get a numeric suffix instead of overwriting each other.
    Problems are reported on stdout; the function does not raise for a
    missing, malformed or structurally incomplete HAR file.

    Args:
        har_filepath (str): The path to the input .har file.
        output_dir (str): The directory where extracted files will be saved.
    """
    # Ensure the output directory exists, creating it if necessary.
    # exist_ok guards against the directory appearing between check and create.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created output directory: {output_dir}")

    # Attempt to read and parse the HAR file. 'utf-8-sig' transparently drops
    # a leading BOM, which json.load would otherwise reject.
    try:
        with open(har_filepath, 'r', encoding='utf-8-sig') as f:
            har_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: The file '{har_filepath}' was not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: '{har_filepath}' is not a valid JSON file.")
        return
    except Exception as e:
        print(f"An unexpected error occurred while reading the file: {e}")
        return

    # Valid JSON is not necessarily a valid HAR document.
    try:
        entries = har_data['log']['entries']
    except (KeyError, TypeError):
        entries = None
    if not isinstance(entries, list):
        print(f"Error: '{har_filepath}' does not contain a HAR 'log.entries' list.")
        return

    used_names = set()  # lower-cased names already claimed during this run

    # Iterate through each HTTP request-response pair in the HAR log.
    for idx, entry in enumerate(entries):
        # --- Step 1: Decode the response content ---
        if not isinstance(entry, dict):
            continue
        response = entry.get('response')
        content = response.get('content') if isinstance(response, dict) else None
        if not isinstance(content, dict):
            continue

        # Skip entries that have no response text (missing or null),
        # e.g. 304 Not Modified.
        text = content.get('text')
        if not isinstance(text, str):
            continue

        request = entry.get('request')
        url = request.get('url') if isinstance(request, dict) else None
        if not isinstance(url, str):
            url = ''  # falls through to the default file name

        if content.get('encoding') == 'base64':
            # Handle Base64 encoded content, typically for binary files like images.
            try:
                data = base64.b64decode(text)
            except (ValueError, binascii.Error) as e:
                print(f"Warning: Could not decode Base64 content from {url}. Skipping. Error: {e}")
                continue
        else:
            # Assume plain text. JSON may carry lone surrogates, which strict
            # UTF-8 encoding refuses; replace them rather than abort the run.
            data = text.encode('utf-8', errors='replace')

        # --- Step 2: Construct a safe, unique filename from the URL ---
        filename = _dedupe_filename(_build_filename(url, idx), used_names)

        # --- Step 3: Write the decoded content to a file ---
        save_path = os.path.join(output_dir, filename)
        try:
            with open(save_path, 'wb') as f:
                f.write(data)
            print(f"Saved: {save_path}")
        except OSError as e:
            # Handle filesystem errors, e.g., filename too long or permission denied.
            print(f"Error: Could not write file {save_path}. OS Error: {e}")
        except Exception as e:
            print(f"An unknown error occurred while saving {save_path}: {e}")
if __name__ == "__main__":
    # Command-line entry point: one positional HAR path plus an optional
    # output directory, both handed straight to main().
    cli = argparse.ArgumentParser(
        description="Extracts and saves all HTTP response bodies from a HAR file.",
        epilog="Example: python har_extractor.py my_archive.har -o extracted_files",
    )
    cli.add_argument("har_file", help="The path to the input .har file.")
    cli.add_argument(
        "-o",
        "--output",
        default="output",
        help="The directory to save the extracted files (default: 'output').",
    )

    options = cli.parse_args()
    main(options.har_file, options.output)