# har_extractor/har_extractor.py at main · all666666all/har_extractor · GitHub
# -*- coding: utf-8 -*-
"""
HAR Content Extractor

A command-line tool to extract and save all files, API responses, and web assets
from a HAR (.har) archive.
"""

import json
import base64
import os
import re
import argparse
from urllib.parse import urlparse
import binascii  # Required for specific exception handling

# Characters that are illegal in file names on at least one major OS.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/*?:"<>|]')


def _decode_content(content: dict, url: str):
    """Return the response body as bytes, or None when there is nothing usable.

    Args:
        content (dict): The ``response.content`` object of a HAR entry.
        url (str): The request URL, used only in warning messages.

    Returns:
        The decoded body as ``bytes``, or ``None`` when the entry carries no
        text (e.g. 304 Not Modified, ``"text": null``) or its Base64 payload
        is invalid.
    """
    text = content.get('text')
    if not isinstance(text, str):
        return None

    if content.get('encoding') == 'base64':
        # Base64 is typically used for binary bodies such as images.
        try:
            return base64.b64decode(text)
        except (ValueError, binascii.Error) as e:
            print(f"Warning: Could not decode Base64 content from {url}. Skipping. Error: {e}")
            return None

    # JSON may legally contain lone surrogate escapes; replace them instead of
    # letting UnicodeEncodeError abort the whole extraction.
    return text.encode('utf-8', errors='replace')


def _build_filename(url: str, idx: int) -> str:
    """Derive a filesystem-safe file name from a request URL.

    Args:
        url (str): The request URL of the HAR entry.
        idx (int): The entry index, used for the fallback name.

    Returns:
        str: The last path component (or the host when the path is empty),
        followed by ``_<query>`` when a query string is present, with all
        characters matching ``_ILLEGAL_FILENAME_CHARS`` removed.
    """
    try:
        parsed_url = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. a broken IPv6 literal).
        return f"default_filename_{idx}"

    # 'script.js' from '/js/script.js'; fall back to the host for '/'.
    filename = os.path.basename(parsed_url.path) or parsed_url.netloc

    if parsed_url.query:
        safe_query = _ILLEGAL_FILENAME_CHARS.sub('_', parsed_url.query)
        # Join with '_' rather than '?': '?' is itself an illegal character
        # and would be stripped below, fusing the name and the query.
        filename = f"{filename}_{safe_query}"

    filename = _ILLEGAL_FILENAME_CHARS.sub('', filename)

    # Empty names and dot-only names ('.', '..') cannot be written as files.
    if not filename.strip('. '):
        filename = f"default_filename_{idx}"
    return filename


def _reserve_unique_name(filename: str, used_names: set) -> str:
    """Return *filename*, suffixed with ``_<n>`` if it was already used.

    The chosen name is recorded in *used_names* (case-folded, so that names
    differing only by case do not clash on case-insensitive filesystems).
    """
    root, ext = os.path.splitext(filename)
    candidate = filename
    counter = 0
    while candidate.casefold() in used_names:
        counter += 1
        candidate = f"{root}_{counter}{ext}"
    used_names.add(candidate.casefold())
    return candidate


def main(har_filepath: str, output_dir: str) -> None:
    """
    Parses a HAR file to extract and save all HTTP response bodies.

    Entries without response text are skipped. When several entries map to
    the same file name, later ones get a numeric suffix (``name_1.ext``,
    ``name_2.ext``, ...) instead of overwriting earlier ones. Errors are
    reported on stdout; the function does not raise for unreadable or
    malformed input.

    Args:
        har_filepath (str): The path to the input .har file.
        output_dir (str): The directory where extracted files will be saved.
    """
    # Ensure the output directory exists, creating it if necessary.
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir, exist_ok=True)
        except OSError as e:
            print(f"Error: Could not create output directory {output_dir}. OS Error: {e}")
            return
        print(f"Created output directory: {output_dir}")

    # Attempt to read and parse the HAR file. 'utf-8-sig' also accepts plain
    # UTF-8 and tolerates the BOM that some tools prepend to HAR exports.
    try:
        with open(har_filepath, 'r', encoding='utf-8-sig') as f:
            har_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: The file '{har_filepath}' was not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: '{har_filepath}' is not a valid JSON file.")
        return
    except Exception as e:
        print(f"An unexpected error occurred while reading the file: {e}")
        return

    # Valid JSON is not necessarily a valid HAR: check the expected structure.
    entries = None
    if isinstance(har_data, dict) and isinstance(har_data.get('log'), dict):
        entries = har_data['log'].get('entries')
    if not isinstance(entries, list):
        print(f"Error: '{har_filepath}' does not contain a HAR 'log.entries' list.")
        return

    used_names = set()

    # Iterate through each HTTP request-response pair in the HAR log.
    for idx, entry in enumerate(entries):
        if not isinstance(entry, dict):
            continue

        # --- Step 1: Decode the response content ---
        response = entry.get('response')
        content = response.get('content') if isinstance(response, dict) else None
        if not isinstance(content, dict):
            continue

        request = entry.get('request')
        url = request.get('url') if isinstance(request, dict) else None
        if not isinstance(url, str):
            url = ''

        data = _decode_content(content, url)
        if data is None:
            continue

        # --- Step 2: Construct a safe, unique filename from the URL ---
        filename = _reserve_unique_name(_build_filename(url, idx), used_names)

        # --- Step 3: Write the decoded content to a file ---
        save_path = os.path.join(output_dir, filename)

        try:
            with open(save_path, 'wb') as f:
                f.write(data)
            print(f"Saved: {save_path}")
        except OSError as e:
            # Handle filesystem errors, e.g., filename too long or permission denied.
            print(f"Error: Could not write file {save_path}. OS Error: {e}")
        except Exception as e:
            print(f"An unknown error occurred while saving {save_path}: {e}")


if __name__ == "__main__":
    # Command-line entry point: one positional HAR path plus an optional
    # output directory, both forwarded unchanged to main().
    cli = argparse.ArgumentParser(
        epilog="Example: python har_extractor.py my_archive.har -o extracted_files",
        description="Extracts and saves all HTTP response bodies from a HAR file.",
    )
    cli.add_argument("har_file", help="The path to the input .har file.")
    cli.add_argument(
        "-o",
        "--output",
        help="The directory to save the extracted files (default: 'output').",
        default="output",
    )

    # Read the user's arguments and run the extraction.
    options = cli.parse_args()
    main(options.har_file, options.output)