Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[run]
omit =
src/hario_core/interfaces.py
src/hario_core/**/interfaces.py

[report]
exclude_lines =
Expand Down
4 changes: 1 addition & 3 deletions .flake8
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
[flake8]
max-line-length = 88
extend-ignore = E203
exclude =
tests/samples.py
extend-ignore = E203
7 changes: 3 additions & 4 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,10 @@ jobs:
python -m pip install --upgrade pip
pip install .[dev]

- name: Lint with flake8, black, mypy
- name: Lint with pre-commit
run: |
flake8 .
black --check .
mypy -p hario_core
pre-commit run --all-files

- name: Test with pytest
run: |
pytest --cov --cov-branch --cov-report=xml
Expand Down
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ __pycache__/
*.py[cod]
*$py.class

# Benchmarks
benchmarks/*.har
benchmarks/*.stats
benchmarks/*.csv

# C extensions
*.so

Expand Down
9 changes: 8 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,27 @@ repos:
rev: 24.3.0
hooks:
- id: black
files: ^src/|^tests/
exclude: ^tests/samples.py$
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
hooks:
- id: isort
files: ^src/|^tests/
exclude: ^tests/samples.py$
- repo: https://github.com/pycqa/flake8
rev: 6.0.0
hooks:
- id: flake8
files: ^src/|^tests/
exclude: ^tests/samples.py$
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.7.1
hooks:
- id: mypy
files: ^src/|^tests/
exclude: ^tests/samples.py$
args: ["--config-file=pyproject.toml"]
exclude: ^(docs|examples)/
additional_dependencies:
- pydantic

65 changes: 49 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ A modern, extensible, and type-safe Python library for parsing, transforming, an
## Features

- **Type-Safe Parsing**: Validates HAR files using Pydantic models, catching errors early.
- **Transformers**: Apply built-in or custom transformations to each HAR entry (e.g., flattening, normalization).
- **Transformers**: Apply built-in or custom transformations to each HAR entry (e.g., flattening, normalization).
- **Normalization**: Ensures all numeric fields (sizes, timings) are non-negative, so you can safely sum, aggregate, and analyze data without errors from negative values. This is crucial for analytics and reporting.
- **Deterministic & Random IDs**: Generate unique or deterministic IDs for each entry. Deterministic IDs ensure that the same request always gets the same ID—useful for deduplication, comparison, and building analytics pipelines.
- **Extensible**: Register your own entry models to support browser-specific or proprietary HAR extensions (e.g., Chrome DevTools, Safari).
Expand All @@ -24,32 +24,65 @@ pip install hario-core

## Quickstart

### 1. Parse and validate a HAR file

```python
from hario_core import parse

har_log = parse("example.har")
entries = har_log.model_dump()["entries"] # list of dicts
```

### 2. Transform entries with a pipeline

```python
from hario_core.transform import Pipeline, flatten, set_id, by_field

pipeline = Pipeline([
set_id(by_field(["request.url", "startedDateTime"]))
])
results = pipeline.process(entries)
```

### 3. Custom entry models (extensions)

```python
from hario_core import parse, Pipeline, by_field, normalize_sizes, flatten
from hario_core.parse import register_entry_model
from hario_core.models import Entry

# Build a processing pipeline: deterministic ID, normalization, flattening
pipeline = Pipeline(
id_fn=by_field(["request.url", "startedDateTime"]),
transformers=[normalize_sizes(), flatten()],
)
def is_custom_entry(entry: dict) -> bool:
return "x-custom" in entry

# Parse your HAR file (from path, bytes, or file-like object)
model = parse("example.har")
result_dict = pipeline.process(model)
class CustomEntry(Entry):
x_custom: str

for entry in result_dict:
print(entry["id"], entry["request"]["url"])
register_entry_model(is_custom_entry, CustomEntry)
```

## Public API

### Parsing and validation
- `parse(path_or_bytes_or_filelike) -> HarLog`
- `validate(har_dict: dict) -> HarLog`
- `register_entry_model(detector: Callable, model: Type[Entry])`
- `entry_selector(entry_dict: dict) -> Type[Entry]`

### Models
- `Entry`, `HarLog`, `DevToolsEntry` (and all standard HAR 1.2 models)

### Transform
- `Pipeline`, `flatten`, `normalize_sizes`, `normalize_timings`, `set_id`, `by_field`, `uuid`, `json_array_handler`

## Documentation

- [API Reference](docs/api.md)
- [Changelog](docs/changelog.md)
- [Contributing](CONTRIBUTING.md)
- [API Reference](https://github.com/pikulev/hario-core/blob/main/docs/api.md)
- [Changelog](https://github.com/pikulev/hario-core/blob/main/docs/changelog.md)
- [Contributing](https://github.com/pikulev/hario-core/blob/main/CONTRIBUTING.md)


## License

MIT License. See [LICENSE](LICENSE).
MIT License. See [LICENSE](https://github.com/pikulev/hario-core/blob/main/LICENSE).

## Supported Python Versions

Expand Down
111 changes: 111 additions & 0 deletions benchmarks/bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
from bench_core import (
STRATEGIES, HAR_PATH,
bench_flatten, bench_full, bench_normalize_sizes, bench_normalize_timings, bench_cpu_heavy,
create_results_table, create_results_csv, average_run, get_entries
)
from rich.console import Console
import argparse
import cProfile
import pstats
import sys



def main() -> None:
    """CLI entry point for the HAR pipeline microbenchmark.

    Runs the selected benchmark mode(s) against every pipeline strategy,
    optionally profiles a single strategy with cProfile, and can export
    the timing/memory results to CSV (file or stdout).
    """
    parser = argparse.ArgumentParser(
        description="""
    Microbenchmark for HAR Pipeline with different strategies, averaging, profiling and CSV output.

    Example usage:
    python bench.py flatten -f my.har --no-gc --csv results.csv
    python bench.py --csv all_results.csv
    python bench.py full --profile process
    """,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "mode",
        nargs="?",
        default="all",
        # "normalize" is kept as an alias that runs both normalize_* benchmarks;
        # the explicit normalize_sizes / normalize_timings choices match bench_map.
        choices=[
            "flatten", "normalize", "normalize_sizes", "normalize_timings",
            "full", "cpu_heavy", "all",
        ],
        help="Benchmark mode (default: all)"
    )
    parser.add_argument(
        "-f", "--file",
        default=HAR_PATH,
        help="Path to HAR file (default: benchmarks/test_lg.har)"
    )
    parser.add_argument(
        "--profile",
        choices=STRATEGIES,
        help="Enable cProfile profiling for the given strategy (e.g. --profile process)"
    )
    parser.add_argument(
        "--no-gc",
        action="store_true",
        help="Disable GC during measurement (default: GC enabled)"
    )
    parser.add_argument(
        "--csv",
        nargs="?",
        const="-",
        help="Save results to CSV file (or stdout if no file is specified)"
    )
    args = parser.parse_args()

    mode = args.mode
    har_path = args.file
    profile_strategy = args.profile
    use_gc = not args.no_gc

    console = Console()
    console.print(f"Loading HAR file: {har_path} ...")
    entries = get_entries(har_path)
    console.print(f"Loaded {len(entries)} entries.")

    bench_map = {
        "flatten": bench_flatten,
        "normalize_sizes": bench_normalize_sizes,
        "normalize_timings": bench_normalize_timings,
        "full": bench_full,
        "cpu_heavy": bench_cpu_heavy,
    }

    # Resolve the CLI mode to the concrete benchmark functions to run.
    if mode == "all":
        selected = dict(bench_map)
    elif mode == "normalize":
        # BUG FIX: "normalize" was accepted by argparse but had no bench_map
        # entry, so bench_map[mode] raised KeyError. Treat it as an alias
        # for both normalize benchmarks.
        selected = {
            name: fn for name, fn in bench_map.items()
            if name.startswith("normalize")
        }
    else:
        selected = {mode: bench_map[mode]}

    if profile_strategy:
        if len(selected) != 1:
            console.print("[red]Profiling is only available for a single benchmark mode.[/red]")
            sys.exit(1)
        (bench_name, bench_func), = selected.items()
        profile_file = f"benchmarks/{bench_name}.stats"
        console.print(f"Profiling... profile saved in {profile_file} (strategy: {profile_strategy})")

        def prof() -> None:
            # Wrapped so cProfile.runctx has a single call site to measure.
            bench_func(entries, profile_strategy, use_gc=use_gc)

        cProfile.runctx("prof()", globals(), locals(), profile_file)
        p = pstats.Stats(profile_file)
        console.print("\n=== TOP-20 functions by cumtime ===\n")
        p.strip_dirs().sort_stats("cumtime").print_stats(20)
        console.print(f"\n=== For visualization, run: snakeviz {profile_file} ===")
    else:
        results = {strategy: {} for strategy in STRATEGIES}
        # One unified loop for both "all" and single-mode runs (the two
        # previous branches were identical except for the dict iterated).
        for test_name, bench_func in selected.items():
            for strategy in STRATEGIES:
                console.print(f"\n[bold]Running {test_name} with {strategy} strategy...[/bold]")
                elapsed, current, peak, rss = average_run(bench_func, entries, strategy, use_gc=use_gc)
                results[strategy][test_name] = (elapsed, current, peak, rss)
        # Display results table
        table = create_results_table(results)
        console.print(table)
        # CSV output: bare "--csv" yields "-", meaning stdout (filename=None).
        if args.csv:
            filename = None if args.csv == "-" else args.csv
            # BUG FIX: the computed filename was previously ignored and a
            # garbled literal path was passed instead.
            create_results_csv(results, filename)

if __name__ == "__main__":
    main()
Loading