
Commit 2023858: rename

1 parent 52e7818

16 files changed (+522, -144 lines)

.python-version

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+3.13

.vscode/settings.json

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+{
+    "java.compile.nullAnalysis.mode": "automatic",
+    "java.configuration.updateBuildConfiguration": "interactive"
+}
Binary file (18.5 KB) not shown.

analyze_duplicates.py

Lines changed: 185 additions & 0 deletions
@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+"""
+Analyze why there's >3x duplication in delta files with exactly 3 replicas.
+Checks if the same identity_hash+advertising_id appears with different timestamps.
+"""
+
+import argparse
+import struct
+import sys
+from collections import defaultdict
+from datetime import datetime
+
+try:
+    import boto3
+    from botocore.exceptions import ClientError
+except ImportError:
+    print("Error: boto3 not installed. Run: pip install boto3")
+    sys.exit(1)
+
+
+class OptOutRecord:
+    ENTRY_SIZE = 72
+
+    def __init__(self, identity_hash: bytes, advertising_id: bytes, timestamp: int):
+        self.identity_hash = identity_hash
+        self.advertising_id = advertising_id
+        self.timestamp = timestamp
+
+    def is_sentinel(self) -> bool:
+        return (self.identity_hash == b'\x00' * 32 or
+                self.identity_hash == b'\xff' * 32)
+
+    def key(self):
+        """Return identity for grouping (hash + id, no timestamp)."""
+        return (self.identity_hash, self.advertising_id)
+
+
+def parse_records_from_file(data: bytes):
+    """Parse records from a delta file."""
+    records = []
+    offset = 0
+    MIN_VALID_TIMESTAMP = 1577836800  # 2020-01-01 00:00:00 UTC
+    MAX_VALID_TIMESTAMP = 4102444800  # 2100-01-01 00:00:00 UTC
+
+    while offset + OptOutRecord.ENTRY_SIZE <= len(data):
+        identity_hash = data[offset:offset + 32]
+        advertising_id = data[offset + 32:offset + 64]
+        timestamp_raw = struct.unpack('<Q', data[offset + 64:offset + 72])[0]
+        timestamp = timestamp_raw & 0xFFFFFFFFFFFFFF  # keep the low 56 bits of the raw value
+
+        record = OptOutRecord(identity_hash, advertising_id, timestamp)
+
+        if record.is_sentinel():
+            offset += OptOutRecord.ENTRY_SIZE
+            continue
+
+        if timestamp < MIN_VALID_TIMESTAMP or timestamp > MAX_VALID_TIMESTAMP:
+            offset += OptOutRecord.ENTRY_SIZE
+            continue
+
+        records.append(record)
+        offset += OptOutRecord.ENTRY_SIZE
+
+    return records
+
+
+def analyze_duplication(bucket: str, prefix: str, dates: list, max_files: int = 100):
+    """Analyze duplication patterns in delta files."""
+
+    s3 = boto3.client('s3')
+
+    # Group records by identity (hash + id)
+    records_by_identity = defaultdict(list)  # key -> [(timestamp, filename), ...]
+    total_entries = 0
+    files_processed = 0
+
+    print(f"Analyzing duplication in s3://{bucket}/{prefix}")
+    print(f"Dates: {', '.join(dates)}")
+    print(f"Max files to process: {max_files}\n")
+
+    for date in dates:
+        full_prefix = f"{prefix}{date}/"
+        print(f"📂 Listing files in {full_prefix}")
+
+        paginator = s3.get_paginator('list_objects_v2')
+        for page in paginator.paginate(Bucket=bucket, Prefix=full_prefix):
+            if 'Contents' not in page:
+                continue
+
+            for obj in page['Contents']:
+                if not obj['Key'].endswith('.dat'):
+                    continue
+
+                if files_processed >= max_files:
+                    print(f"\n⚠️ Reached max files limit ({max_files})")
+                    break
+
+                filename = obj['Key'].split('/')[-1]
+                print(f"  [{files_processed + 1}] {filename}", end=' ', flush=True)
+
+                try:
+                    response = s3.get_object(Bucket=bucket, Key=obj['Key'])
+                    data = response['Body'].read()
+                    records = parse_records_from_file(data)
+
+                    for record in records:
+                        key = record.key()
+                        records_by_identity[key].append((record.timestamp, filename))
+
+                    total_entries += len(records)
+                    files_processed += 1
+                    print(f"({len(records)} records)")
+
+                except Exception as e:
+                    print(f"ERROR: {e}")
+
+            if files_processed >= max_files:
+                break
+
+        if files_processed >= max_files:
+            break
+
+    # Guard against an empty result set before computing ratios
+    if not records_by_identity:
+        print("\nNo records found; nothing to analyze.")
+        return
+
+    print(f"\n📊 Analysis Results:")
+    print(f"  Files processed: {files_processed}")
+    print(f"  Total entries: {total_entries}")
+    print(f"  Unique identities: {len(records_by_identity)}")
+    print(f"  Average duplication: {total_entries / len(records_by_identity):.2f}x\n")
+
+    # Analyze duplication patterns
+    exact_3x = 0
+    more_than_3x = 0
+    different_timestamps = 0
+
+    print("🔍 Duplication Breakdown:")
+    for key, occurrences in records_by_identity.items():
+        count = len(occurrences)
+        if count == 3:
+            exact_3x += 1
+        elif count > 3:
+            more_than_3x += 1
+
+        # Check if same identity has different timestamps (multiple submissions)
+        unique_timestamps = len(set(ts for ts, _ in occurrences))
+        if unique_timestamps > 1:
+            different_timestamps += 1
+
+    print(f"  Identities appearing exactly 3x: {exact_3x} ({100 * exact_3x / len(records_by_identity):.1f}%)")
+    print(f"  Identities appearing >3x: {more_than_3x} ({100 * more_than_3x / len(records_by_identity):.1f}%)")
+    print(f"  Identities with multiple timestamps: {different_timestamps} ({100 * different_timestamps / len(records_by_identity):.1f}%)\n")
+
+    # Show examples of >3x duplication
+    if more_than_3x > 0:
+        print("📋 Sample identities with >3x duplication:")
+        count = 0
+        for key, occurrences in records_by_identity.items():
+            if len(occurrences) > 3 and count < 5:
+                hash_hex = key[0].hex()[:16]
+                id_hex = key[1].hex()[:16]
+                print(f"\n  {count + 1}. hash={hash_hex}..., id={id_hex}... ({len(occurrences)} occurrences):")
+
+                unique_timestamps = set(ts for ts, _ in occurrences)
+                print(f"     Unique timestamps: {len(unique_timestamps)}")
+
+                for ts, filename in sorted(occurrences)[:10]:  # Show first 10
+                    dt = datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
+                    print(f"     {dt} in {filename}")
+
+                if len(occurrences) > 10:
+                    print(f"     ... and {len(occurrences) - 10} more")
+
+                count += 1
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Analyze opt-out duplication patterns')
+    parser.add_argument('--bucket', required=True, help='S3 bucket name')
+    parser.add_argument('--prefix', required=True, help='S3 prefix (e.g., optout-v2/delta/)')
+    parser.add_argument('--date', action='append', dest='dates', required=True,
+                        help='Date folder (can specify multiple times)')
+    parser.add_argument('--max-files', type=int, default=100,
+                        help='Maximum number of files to analyze (default: 100)')
+
+    args = parser.parse_args()
+    analyze_duplication(args.bucket, args.prefix, args.dates, args.max_files)
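For reference, a typical invocation might look like the following; the bucket name and date folders below are placeholders, while the flags come straight from the script's argparse definition (the --prefix example mirrors the one in the argument help):

    python3 analyze_duplicates.py \
        --bucket my-optout-bucket \
        --prefix optout-v2/delta/ \
        --date 2024-06-01 --date 2024-06-02 \
        --max-files 50

Each --date flag adds another date folder to scan, and processing stops once --max-files objects ending in .dat have been read.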
Binary file not shown.

debug_delta_file.py

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+"""Debug script to inspect a single delta file byte-by-byte."""
+
+import struct
+import sys
+from datetime import datetime
+
+
+def analyze_delta_file(filename):
+    """Analyze the structure of a delta file."""
+    with open(filename, 'rb') as f:
+        data = f.read()
+
+    print(f"File: {filename}")
+    print(f"Total size: {len(data)} bytes")
+    print(f"Expected entries: {len(data) // 72}")
+    print()
+
+    offset = 0
+    entry_num = 0
+
+    while offset + 72 <= len(data):
+        entry_num += 1
+        print(f"Entry #{entry_num} at offset {offset}:")
+
+        # Read components
+        identity_hash = data[offset:offset + 32]
+        advertising_id = data[offset + 32:offset + 64]
+        timestamp_bytes = data[offset + 64:offset + 72]
+
+        # Try both interpretations of the 8 timestamp bytes
+        ts_little_8byte = struct.unpack('<Q', timestamp_bytes)[0]
+        ts_little_4byte = struct.unpack('<I', timestamp_bytes[:4])[0]
+        extra_4bytes = struct.unpack('<I', timestamp_bytes[4:])[0]
+
+        print(f"  Identity Hash:  {identity_hash.hex()[:32]}...")
+        print(f"  Advertising ID: {advertising_id.hex()[:32]}...")
+        print(f"  Timestamp bytes (hex): {timestamp_bytes.hex()}")
+        print(f"  Byte breakdown: {' '.join(f'{b:02x}' for b in timestamp_bytes)}")
+        print(f"  Low 4 bytes:  0x{timestamp_bytes[:4].hex()} = {ts_little_4byte}")
+        print(f"  High 4 bytes: 0x{timestamp_bytes[4:].hex()} = {extra_4bytes} (0x{extra_4bytes:08x})")
+        print(f"  Full 8-byte timestamp: {ts_little_8byte}")
+
+        # Check if sentinel
+        is_null = identity_hash == b'\x00' * 32
+        is_ones = identity_hash == b'\xff' * 32
+        if is_null or is_ones:
+            print(f"  Type: SENTINEL ({'null' if is_null else 'ones'})")
+            try:
+                dt = datetime.fromtimestamp(ts_little_8byte)
+                print(f"  Valid timestamp: {dt.strftime('%Y-%m-%d %H:%M:%S')}")
+            except (ValueError, OSError, OverflowError):
+                print("  Invalid timestamp!")
+        else:
+            print("  Type: DATA")
+            # For data entries, check if the first 4 bytes alone make sense as a timestamp
+            try:
+                dt = datetime.fromtimestamp(ts_little_4byte)
+                print(f"  If 4-byte timestamp: {dt.strftime('%Y-%m-%d %H:%M:%S')}")
+                print(f"  Extra data in high bytes: 0x{extra_4bytes:08x}")
+            except (ValueError, OSError, OverflowError):
+                pass
+
+        print()
+        offset += 72
+
+        if entry_num >= 5:  # Only show first 5 entries
+            print(f"... ({(len(data) // 72) - 5} more entries)")
+            break
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 2:
+        print("Usage: python3 debug_delta_file.py <delta-file.dat>")
+        sys.exit(1)
+
+    analyze_delta_file(sys.argv[1])
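To exercise this script without real data, one can synthesize a tiny .dat file in the same 72-byte layout the parser assumes (32-byte identity hash, 32-byte advertising ID, 8-byte little-endian timestamp). A minimal sketch; the file name and field bytes here are made up for illustration:

    # write a synthetic two-entry delta file (hypothetical test helper)
    import struct
    import time

    now = int(time.time())
    with open('test-delta.dat', 'wb') as f:
        # sentinel entry: all-zero identity hash (advertising id also zeroed)
        f.write(b'\x00' * 64 + struct.pack('<Q', now))
        # data entry: dummy hash and advertising id bytes
        f.write(b'\x11' * 32 + b'\x22' * 32 + struct.pack('<Q', now))

    # then inspect it: python3 debug_delta_file.py test-delta.dat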

main.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+def main():
+    print("Hello from uid2-optout!")
+
+
+if __name__ == "__main__":
+    main()

manual-override.json

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+{
+    "manual_override": ""
+}
Binary file (216 Bytes) not shown.

pyproject.toml

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+[project]
+name = "uid2-optout"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+    "boto3>=1.40.68",
+]
