Skip to content

Commit d1b3110

Browse files
add CRC32C variant and file processing API with Hasher enhancements
1 parent b582409 commit d1b3110

File tree

6 files changed

+1085
-44
lines changed

6 files changed

+1085
-44
lines changed

README.md

Lines changed: 96 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -14,62 +14,128 @@ pip install pycrc32
1414
```
1515

1616
## Usage
17+
18+
### Basic CRC32 and CRC32C
1719
```python
18-
from pycrc32 import crc32
20+
from pycrc32 import crc32, crc32c
1921

2022
data = b"123456789"
21-
print(f"crc32 for {data!r} is {crc32(data)}")
23+
24+
# Standard CRC32 (IEEE 802.3 polynomial)
25+
print(f"CRC32 for {data!r}: {crc32(data):#x}")
26+
27+
# CRC32C (Castagnoli polynomial, used in iSCSI, SCTP, ext4, etc.)
28+
print(f"CRC32C for {data!r}: {crc32c(data):#x}")
2229
```
2330

24-
### Advanced Checksum Calculation with `Hasher`
25-
For scenarios that require more flexibility, such as processing large amounts of data or computing the checksum in stages, you can use the `Hasher` class:
31+
### Incremental Hashing with `Hasher` Class
32+
33+
The `Hasher` class provides incremental hashing capabilities for processing large data in chunks:
34+
35+
#### Basic Incremental Hashing
2636
```python
2737
from pycrc32 import Hasher
2838

2939
# Create a new Hasher instance
3040
hasher = Hasher()
3141

32-
# Update the hasher with data chunks
42+
# Update with data chunks
3343
hasher.update(b"123456")
3444
hasher.update(b"789")
3545

36-
# Finalize the computation and get the checksum
46+
# Get the final checksum
3747
checksum = hasher.finalize()
38-
print(f"Checksum: {checksum}")
48+
print(f"Checksum: {checksum:#x}")
49+
print(f"Bytes processed: {len(hasher)}")
50+
print(f"Hasher state: {repr(hasher)}")
51+
```
52+
53+
#### Advanced Hasher Features
54+
```python
55+
# Initialize with custom initial state
56+
hasher = Hasher.with_initial(0x12345678)
57+
hasher.update(b"data")
58+
result = hasher.finalize()
59+
60+
# Create independent copies
61+
hasher1 = Hasher()
62+
hasher1.update(b"common")
63+
hasher2 = hasher1.copy() # Independent copy
64+
hasher2.update(b"additional")
3965

40-
# Reset the hasher to compute another checksum
66+
print(f"Original: {hasher1.finalize():#x}") # Only "common"
67+
print(f"Copy: {hasher2.finalize():#x}") # "common" + "additional"
68+
69+
# Context manager usage
70+
with Hasher() as ctx_hasher:
71+
ctx_hasher.update(b"context data")
72+
result = ctx_hasher.finalize()
73+
74+
# Reset functionality
4175
hasher.reset()
42-
hasher.update(b"The quick brown fox jumps over the lazy dog")
43-
new_checksum = hasher.finalize()
44-
print(f"New checksum: {new_checksum}")
76+
print(f"After reset: {hasher.finalize():#x}")
77+
78+
# Combine states (for parallel processing)
79+
hasher1 = Hasher()
80+
hasher1.update(b"part1")
81+
hasher2 = Hasher()
82+
hasher2.update(b"part2")
83+
hasher1.combine(hasher2)
84+
combined = hasher1.finalize()
4585
```
4686

47-
You can also initialize a `Hasher` with a specific initial CRC32 state:
87+
### File Processing
4888
```python
49-
initial_crc = 12345678
50-
hasher = Hasher.with_initial(initial_crc)
51-
52-
hasher.update(b"additional data")
53-
final_checksum = hasher.finalize()
54-
print(f"Final checksum with initial state: {final_checksum}")
89+
from pycrc32 import crc32_file, crc32_fileobj
90+
91+
# Process files by path
92+
file_crc = crc32_file("/path/to/file.txt")
93+
print(f"File CRC32: {file_crc:#x}")
94+
95+
# Process file objects
96+
with open("/path/to/file.txt", "rb") as f:
97+
fileobj_crc = crc32_fileobj(f)
98+
print(f"File object CRC32: {fileobj_crc:#x}")
99+
100+
# Works with any file-like object (BytesIO, etc.)
101+
import io
102+
data = b"file-like data"
103+
bio = io.BytesIO(data)
104+
bio_crc = crc32_fileobj(bio)
105+
print(f"BytesIO CRC32: {bio_crc:#x}")
55106
```
56107

57-
To combine checksums from different data blocks without needing to concatenate the data, use the `combine` method:
108+
### Enhanced Error Handling
58109
```python
59-
hasher1 = Hasher()
60-
hasher1.update(b"Data block 1")
61-
checksum1 = hasher1.finalize()
110+
from pycrc32 import crc32, crc32_file, Hasher
111+
112+
# Provides helpful error messages
113+
try:
114+
crc32("invalid string")
115+
except TypeError as e:
116+
print(f"Clear error message: {e}")
117+
# Output: crc32() expects bytes-like object, got string. Use b'your string' or your_string.encode() instead.
118+
119+
try:
120+
crc32_file("/nonexistent/file.txt")
121+
except FileNotFoundError as e:
122+
print(f"File error: {e}")
123+
# Output: File not found: /nonexistent/file.txt
124+
```
62125

63-
hasher2 = Hasher()
64-
hasher2.update(b"Data block 2")
65-
checksum2 = hasher2.finalize()
126+
### Type Safety and IDE Support
127+
```python
128+
# Full type hints available
129+
from pycrc32 import crc32, crc32c, Hasher, crc32_file, crc32_fileobj
130+
from typing import Union
66131

67-
# Combine checksums from hasher1 into hasher2
68-
hasher1.combine(hasher2) # Combine the state of hasher2 into hasher1
132+
def process_data(data: Union[bytes, bytearray]) -> int:
133+
"""Function with full type hints."""
134+
return crc32(data)
69135

70-
# The final checksum after combination
71-
combined_checksum = hasher1.finalize()
72-
print(f"Combined checksum: {combined_checksum}")
136+
# IDE autocompletion and inline documentation
137+
hasher: Hasher = Hasher() # Type annotation
138+
hasher.update(b"data") # IDE shows method signatures
73139
```
74140

75141
## Speed

examples/usage.py

Lines changed: 199 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,200 @@
1-
from pycrc32 import crc32
1+
"""
2+
Comprehensive example demonstrating all pycrc32 features.
23
3-
data = b"123456789"
4-
print(f"crc32 for {data!r} is {crc32(data)}")
4+
This example shows:
5+
- Basic CRC32 computation
6+
- CRC32C variant (Castagnoli polynomial)
7+
- Incremental hashing with Hasher
8+
- Advanced Hasher features (copy, context manager, etc.)
9+
- File processing
10+
- Error handling
11+
"""
12+
13+
from pycrc32 import crc32, crc32c, Hasher, crc32_file, crc32_fileobj
14+
import tempfile
15+
import io
16+
17+
18+
def basic_usage():
19+
"""Demonstrate basic CRC32 and CRC32C usage."""
20+
print("=== Basic Usage ===")
21+
22+
data = b"123456789"
23+
24+
# Standard CRC32 (IEEE 802.3 polynomial)
25+
crc32_result = crc32(data)
26+
print(f"Standard CRC32 for {data!r}: {crc32_result:#x} ({crc32_result})")
27+
28+
# CRC32C (Castagnoli polynomial, used in iSCSI, SCTP, ext4, etc.)
29+
crc32c_result = crc32c(data)
30+
print(f"CRC32C for {data!r}: {crc32c_result:#x} ({crc32c_result})")
31+
32+
# They should be different
33+
print(f"Different algorithms: {crc32_result != crc32c_result}")
34+
print()
35+
36+
37+
def incremental_hashing():
38+
"""Demonstrate incremental hashing features."""
39+
print("=== Incremental Hashing ===")
40+
41+
data_chunks = [b"chunk1", b"chunk2", b"chunk3"]
42+
43+
# Basic incremental hashing
44+
hasher = Hasher()
45+
for chunk in data_chunks:
46+
hasher.update(chunk)
47+
print(f"Added {chunk!r}, bytes processed: {len(hasher)}")
48+
49+
final_result = hasher.finalize()
50+
print(f"Final CRC32: {final_result:#x}")
51+
print(f"Hasher state: {repr(hasher)}")
52+
print()
53+
54+
# Using copy method
55+
hasher1 = Hasher()
56+
hasher1.update(b"first")
57+
hasher2 = hasher1.copy() # Independent copy
58+
hasher2.update(b"second")
59+
60+
print(f"Original: {hasher1.finalize():#x}") # Only "first"
61+
print(f"Copy: {hasher2.finalize():#x}") # "first" + "second"
62+
print()
63+
64+
# Context manager usage
65+
with Hasher() as ctx_hasher:
66+
ctx_hasher.update(b"context data")
67+
ctx_result = ctx_hasher.finalize()
68+
69+
print(f"Context manager result: {ctx_result:#x}")
70+
print()
71+
72+
73+
def advanced_hasher_features():
74+
"""Demonstrate advanced Hasher features."""
75+
print("=== Advanced Hasher Features ===")
76+
77+
# Hasher with initial state
78+
initial_value = 0x12345678
79+
hasher = Hasher.with_initial(initial_value)
80+
hasher.update(b"additional data")
81+
result = hasher.finalize()
82+
print(f"Hasher with initial value {initial_value:#x}: {result:#x}")
83+
84+
# Reset functionality
85+
hasher.reset()
86+
print(f"After reset - bytes processed: {len(hasher)}")
87+
print(f"After reset - CRC32: {hasher.finalize():#x}")
88+
89+
# Combine functionality
90+
hasher1 = Hasher()
91+
hasher1.update(b"part1")
92+
hasher2 = Hasher()
93+
hasher2.update(b"part2")
94+
95+
hasher1.combine(hasher2)
96+
combined_result = hasher1.finalize()
97+
direct_result = crc32(b"part1part2")
98+
99+
print(f"Combined result: {combined_result:#x}")
100+
print(f"Direct computation: {direct_result:#x}")
101+
print(f"Combine works correctly: {combined_result == direct_result}")
102+
print()
103+
104+
105+
def file_processing():
106+
"""Demonstrate file processing capabilities."""
107+
print("=== File Processing ===")
108+
109+
test_data = b"This is test data for file processing."
110+
111+
# Create a temporary file
112+
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
113+
tmp_file.write(test_data)
114+
tmp_file.flush()
115+
116+
# Process file by path
117+
file_result = crc32_file(tmp_file.name)
118+
print(f"CRC32 of file '{tmp_file.name}': {file_result:#x}")
119+
120+
# Process file as file object
121+
with open(tmp_file.name, 'rb') as f:
122+
fileobj_result = crc32_fileobj(f)
123+
print(f"CRC32 of file object: {fileobj_result:#x}")
124+
125+
# Verify consistency
126+
direct_result = crc32(test_data)
127+
print(f"Direct computation: {direct_result:#x}")
128+
print(f"All methods match: {file_result == fileobj_result == direct_result}")
129+
130+
# Process in-memory file-like object
131+
bio = io.BytesIO(test_data)
132+
bio_result = crc32_fileobj(bio)
133+
print(f"CRC32 of BytesIO: {bio_result:#x}")
134+
print()
135+
136+
137+
def error_handling():
138+
"""Demonstrate error handling."""
139+
print("=== Error Handling ===")
140+
141+
# These operations will raise informative exceptions
142+
try:
143+
crc32("invalid string input")
144+
except TypeError as e:
145+
print(f"Expected error for string input: {e}")
146+
147+
try:
148+
crc32(12345)
149+
except TypeError as e:
150+
print(f"Expected error for int input: {e}")
151+
152+
try:
153+
crc32_file("/nonexistent/file.txt")
154+
except FileNotFoundError as e:
155+
print(f"Expected error for nonexistent file: {e}")
156+
157+
try:
158+
crc32_fileobj("not a file object")
159+
except TypeError as e:
160+
print(f"Expected error for invalid file object: {e}")
161+
print()
162+
163+
164+
def performance_comparison():
165+
"""Simple performance comparison."""
166+
import time
167+
168+
print("=== Simple Performance Test ===")
169+
170+
data = b"performance test data" * 1000 # ~21KB
171+
iterations = 1000
172+
173+
# Time standard CRC32
174+
start_time = time.time()
175+
for _ in range(iterations):
176+
crc32(data)
177+
crc32_time = time.time() - start_time
178+
179+
# Time CRC32C
180+
start_time = time.time()
181+
for _ in range(iterations):
182+
crc32c(data)
183+
crc32c_time = time.time() - start_time
184+
185+
print(f"Processed {len(data) * iterations} bytes")
186+
print(f"CRC32 time: {crc32_time:.4f}s")
187+
print(f"CRC32C time: {crc32c_time:.4f}s")
188+
print(f"CRC32C overhead: {(crc32c_time / crc32_time - 1) * 100:.1f}%")
189+
print()
190+
191+
192+
if __name__ == "__main__":
193+
basic_usage()
194+
incremental_hashing()
195+
advanced_hasher_features()
196+
file_processing()
197+
error_handling()
198+
performance_comparison()
199+
200+
print("All pycrc32 features demonstrated successfully!")

pycrc32/__init__.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,30 @@
1-
from .pycrc32 import Hasher, crc32
1+
"""
2+
pycrc32 - Python module for SIMD-accelerated CRC32 checksum computation.
3+
4+
This package provides high-performance CRC32 computation using Rust's crc32fast library
5+
with Python bindings via PyO3.
6+
7+
Basic usage:
8+
>>> from pycrc32 import crc32
9+
>>> crc32(b"hello")
10+
907060870
11+
12+
Advanced usage with incremental hashing:
13+
>>> from pycrc32 import Hasher
14+
>>> hasher = Hasher()
15+
>>> hasher.update(b"first chunk")
16+
>>> hasher.update(b"second chunk")
17+
>>> hasher.finalize() == crc32(b"first chunksecond chunk")
18+
True
19+
"""
20+
21+
from typing import IO, Union, Any
22+
from .pycrc32 import Hasher, crc32, crc32_file, crc32_fileobj, crc32c_func as crc32c
23+
24+
__all__ = ["Hasher", "crc32", "crc32c", "crc32_file", "crc32_fileobj"]
25+
__version__ = "0.3.0"
26+
27+
# Type aliases for better documentation
28+
FilePath = Union[str, bytes]
29+
FileObject = IO[bytes]
30+
DataInput = Union[bytes, bytearray, memoryview, Any]

0 commit comments

Comments
 (0)