IABTechLab
diff --git a/‎.python-version‎
Lines changed: 1 addition & 0 deletions b/‎.python-version‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.vscode/settings.json‎
Lines changed: 4 additions & 0 deletions b/‎.vscode/settings.json‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎__pycache__/compare_delta_folders.cpython-313.pyc‎
18.5 KB b/‎__pycache__/compare_delta_folders.cpython-313.pyc‎
18.5 KB
diff --git a/‎analyze_duplicates.py‎
Lines changed: 185 additions & 0 deletions b/‎analyze_duplicates.py‎
Lines changed: 185 additions & 0 deletions
diff --git a/‎data/optout/producer/delta/optout-delta--01_2025-11-04T21.37.09Z_d475bd37.dat‎
288 Bytes b/‎data/optout/producer/delta/optout-delta--01_2025-11-04T21.37.09Z_d475bd37.dat‎
288 Bytes
diff --git a/‎debug_delta_file.py‎
Lines changed: 77 additions & 0 deletions b/‎debug_delta_file.py‎
Lines changed: 77 additions & 0 deletions
diff --git a/‎main.py‎
Lines changed: 6 additions & 0 deletions b/‎main.py‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎manual-override.json‎
Lines changed: 3 additions & 0 deletions b/‎manual-override.json‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎optout-delta-001_2025-12-07T23.08.06Z_8a6d9d90.dat‎
216 Bytes b/‎optout-delta-001_2025-12-07T23.08.06Z_8a6d9d90.dat‎
216 Bytes
diff --git a/‎pyproject.toml‎
Lines changed: 9 additions & 0 deletions b/‎pyproject.toml‎
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1 @@
+3.13
@@ -0,0 +1,4 @@
+{
+    "java.compile.nullAnalysis.mode": "automatic",
+    "java.configuration.updateBuildConfiguration": "interactive"
+}
@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+"""
+Analyze why there's >3x duplication in delta files with exactly 3 replicas.
+Checks if the same identity_hash+advertising_id appears with different timestamps.
+"""
+
+import argparse
+import struct
+import sys
+from collections import defaultdict
+from datetime import datetime
+
+try:
+    import boto3
+    from botocore.exceptions import ClientError
+except ImportError:
+    print("Error: boto3 not installed. Run: pip install boto3")
+    sys.exit(1)
+
+
+class OptOutRecord:
+    """A single opt-out entry: identity hash, advertising id and timestamp."""
+
+    # Number of bytes one serialized entry occupies in a delta file.
+    ENTRY_SIZE = 72
+
+    def __init__(self, identity_hash: bytes, advertising_id: bytes, timestamp: int):
+        self.identity_hash, self.advertising_id = identity_hash, advertising_id
+        self.timestamp = timestamp
+
+    def is_sentinel(self) -> bool:
+        """Return True when the identity hash is 32 zero bytes or 32 0xff bytes."""
+        digest = self.identity_hash
+        if digest == b'\x00' * 32:
+            return True
+        return digest == b'\xff' * 32
+
+    def key(self):
+        """Return the (identity_hash, advertising_id) pair used for grouping.
+
+        The timestamp is left out on purpose so that repeated submissions of
+        the same identity land in the same group.
+        """
+        grouping_key = (self.identity_hash, self.advertising_id)
+        return grouping_key
+
+
+def parse_records_from_file(data: bytes):
+    """Decode the fixed-width opt-out entries found in ``data``.
+
+    Entries are read back to back in steps of ``OptOutRecord.ENTRY_SIZE``;
+    bytes 0-31 are the identity hash, bytes 32-63 the advertising id and
+    bytes 64-71 a little-endian unsigned 64-bit timestamp field. Trailing
+    bytes that do not fill a whole entry are ignored.
+
+    Sentinel entries and entries whose timestamp falls outside the accepted
+    window are dropped. Returns the surviving ``OptOutRecord`` objects in
+    file order.
+    """
+    earliest_ts = 1577836800  # 2020-01-01T00:00:00Z
+    latest_ts = 4102444800    # 2100-01-01T00:00:00Z
+    low_56_bits = 0xFFFFFFFFFFFFFF  # drops the top byte of the 8-byte field
+    stride = OptOutRecord.ENTRY_SIZE
+
+    parsed = []
+    for start in range(0, len(data) - stride + 1, stride):
+        (raw_ts,) = struct.unpack_from('<Q', data, start + 64)
+        candidate = OptOutRecord(
+            data[start:start + 32],
+            data[start + 32:start + 64],
+            raw_ts & low_56_bits,
+        )
+
+        if candidate.is_sentinel():
+            continue
+
+        if earliest_ts <= candidate.timestamp <= latest_ts:
+            parsed.append(candidate)
+
+    return parsed
+
+
+def _collect_occurrences(s3, bucket, prefix, dates, max_files):
+    """Download delta files and group every parsed record by identity.
+
+    Lists ``{prefix}{date}/`` for each date, fetches every key ending in
+    ``.dat`` and parses it, stopping once ``max_files`` files have been
+    parsed successfully. A file that fails to download or parse is reported
+    on stdout and does not count towards the limit.
+
+    Returns ``(records_by_identity, total_entries, files_processed)`` where
+    ``records_by_identity`` maps ``record.key()`` to a list of
+    ``(timestamp, filename)`` tuples.
+    """
+    records_by_identity = defaultdict(list)  # key -> [(timestamp, filename), ...]
+    total_entries = 0
+    files_processed = 0
+    paginator = s3.get_paginator('list_objects_v2')
+
+    for date in dates:
+        full_prefix = f"{prefix}{date}/"
+        print(f"📂 Listing files in {full_prefix}")
+
+        for page in paginator.paginate(Bucket=bucket, Prefix=full_prefix):
+            # A page has no 'Contents' key when nothing matched the prefix.
+            for obj in page.get('Contents', []):
+                object_key = obj['Key']
+                if not object_key.endswith('.dat'):
+                    continue
+
+                if files_processed >= max_files:
+                    print(f"\n⚠️  Reached max files limit ({max_files})")
+                    return records_by_identity, total_entries, files_processed
+
+                filename = object_key.split('/')[-1]
+                print(f"   [{files_processed + 1}] {filename}", end=' ', flush=True)
+
+                try:
+                    response = s3.get_object(Bucket=bucket, Key=object_key)
+                    records = parse_records_from_file(response['Body'].read())
+                except Exception as e:
+                    # Best effort: one unreadable file must not abort the
+                    # whole analysis, so report it and move on.
+                    print(f"ERROR: {e}")
+                    continue
+
+                for record in records:
+                    records_by_identity[record.key()].append((record.timestamp, filename))
+
+                total_entries += len(records)
+                files_processed += 1
+                print(f"({len(records)} records)")
+
+            # Limit reached at a page boundary: do not request further pages.
+            if files_processed >= max_files:
+                return records_by_identity, total_entries, files_processed
+
+    return records_by_identity, total_entries, files_processed
+
+
+def _print_duplication_report(records_by_identity, total_entries, files_processed):
+    """Print the summary, the duplication breakdown and up to five samples."""
+    unique_identities = len(records_by_identity)
+
+    print("\n📊 Analysis Results:")
+    print(f"   Files processed: {files_processed}")
+    print(f"   Total entries: {total_entries}")
+    print(f"   Unique identities: {unique_identities}")
+
+    if unique_identities == 0:
+        # Every ratio below divides by the identity count; without records
+        # there is nothing meaningful to report.
+        print("   No valid records found; nothing to analyze.")
+        return
+
+    print(f"   Average duplication: {total_entries / unique_identities:.2f}x\n")
+
+    # Analyze duplication patterns
+    exact_3x = 0
+    more_than_3x = 0
+    different_timestamps = 0
+
+    print("🔍 Duplication Breakdown:")
+    for occurrences in records_by_identity.values():
+        count = len(occurrences)
+        if count == 3:
+            exact_3x += 1
+        elif count > 3:
+            more_than_3x += 1
+
+        # More than one distinct timestamp for the same identity means it was
+        # submitted more than once rather than merely copied.
+        if len({ts for ts, _ in occurrences}) > 1:
+            different_timestamps += 1
+
+    print(f"   Identities appearing exactly 3x: {exact_3x} ({100 * exact_3x / unique_identities:.1f}%)")
+    print(f"   Identities appearing >3x: {more_than_3x} ({100 * more_than_3x / unique_identities:.1f}%)")
+    print(f"   Identities with multiple timestamps: {different_timestamps} ({100 * different_timestamps / unique_identities:.1f}%)\n")
+
+    if more_than_3x == 0:
+        return
+
+    # Show examples of >3x duplication
+    print("📋 Sample identities with >3x duplication:")
+    shown = 0
+    for key, occurrences in records_by_identity.items():
+        if len(occurrences) <= 3:
+            continue
+
+        hash_hex = key[0].hex()[:16]
+        id_hex = key[1].hex()[:16]
+        print(f"\n   {shown + 1}. hash={hash_hex}..., id={id_hex}... ({len(occurrences)} occurrences):")
+
+        unique_timestamps = {ts for ts, _ in occurrences}
+        print(f"      Unique timestamps: {len(unique_timestamps)}")
+
+        for ts, filename in sorted(occurrences)[:10]:  # Show first 10
+            # NOTE: rendered in the machine's local time zone.
+            dt = datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
+            print(f"         {dt} in {filename}")
+
+        if len(occurrences) > 10:
+            print(f"         ... and {len(occurrences) - 10} more")
+
+        shown += 1
+        if shown >= 5:
+            # Five samples are enough; stop scanning the remaining identities.
+            break
+
+
+def analyze_duplication(bucket: str, prefix: str, dates: list, max_files: int = 100):
+    """Analyze duplication patterns in delta files stored in S3.
+
+    Args:
+        bucket: Name of the S3 bucket to read from.
+        prefix: Key prefix that the date folder names are appended to.
+        dates: Date folder names; each is listed as ``{prefix}{date}/``.
+        max_files: Upper bound on the number of files parsed in total.
+
+    Prints progress while downloading, then a report of how often each
+    (identity_hash, advertising_id) pair occurs and whether it occurs with
+    more than one timestamp. When no valid record is found the report says
+    so instead of dividing by zero. Returns None.
+    """
+    s3 = boto3.client('s3')
+
+    print(f"Analyzing duplication in s3://{bucket}/{prefix}")
+    print(f"Dates: {', '.join(dates)}")
+    print(f"Max files to process: {max_files}\n")
+
+    records_by_identity, total_entries, files_processed = _collect_occurrences(
+        s3, bucket, prefix, dates, max_files)
+    _print_duplication_report(records_by_identity, total_entries, files_processed)
+
+
+if __name__ == '__main__':
+    # Command-line entry point: collect the S3 location, the date folders and
+    # the file limit, then run the analysis.
+    cli = argparse.ArgumentParser(description='Analyze opt-out duplication patterns')
+    cli.add_argument('--bucket', help='S3 bucket name', required=True)
+    cli.add_argument('--prefix', help='S3 prefix (e.g., optout-v2/delta/)', required=True)
+    cli.add_argument(
+        '--date',
+        dest='dates',
+        action='append',
+        required=True,
+        help='Date folder (can specify multiple times)',
+    )
+    cli.add_argument(
+        '--max-files',
+        type=int,
+        default=100,
+        help='Maximum number of files to analyze (default: 100)',
+    )
+
+    options = cli.parse_args()
+    analyze_duplication(
+        options.bucket, options.prefix, options.dates, options.max_files)
+
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+"""Debug script to inspect a single delta file byte-by-byte"""
+
+import struct
+import sys
+
+def analyze_delta_file(filename, max_entries=5):
+    """Print a field-by-field dump of the first entries of a delta file.
+
+    The file is read as consecutive 72-byte entries: a 32-byte identity
+    hash, a 32-byte advertising id and an 8-byte timestamp field. Trailing
+    bytes that do not fill a whole entry are ignored.
+
+    Args:
+        filename: Path of the delta file to inspect.
+        max_entries: How many leading entries to print (default 5). When the
+            file holds more, a final line reports how many were left out.
+
+    Raises:
+        OSError: If the file cannot be opened or read.
+    """
+    # Imported here because the module header only imports struct and sys.
+    from datetime import datetime
+
+    entry_size = 72
+    # Failures datetime.fromtimestamp() can raise for out-of-range values.
+    bad_timestamp = (OverflowError, OSError, ValueError)
+
+    with open(filename, 'rb') as f:
+        data = f.read()
+
+    total_entries = len(data) // entry_size
+    shown_entries = max(0, min(total_entries, max_entries))
+
+    print(f"File: {filename}")
+    print(f"Total size: {len(data)} bytes")
+    print(f"Expected entries: {total_entries}")
+    print()
+
+    for entry_num in range(1, shown_entries + 1):
+        offset = (entry_num - 1) * entry_size
+        print(f"Entry #{entry_num} at offset {offset}:")
+
+        # Read components
+        identity_hash = data[offset:offset + 32]
+        advertising_id = data[offset + 32:offset + 64]
+        timestamp_bytes = data[offset + 64:offset + 72]
+
+        # Decode the field as little-endian in two widths: the full 8 bytes,
+        # and separately its low and high 4-byte halves.
+        ts_little_8byte = struct.unpack('<Q', timestamp_bytes)[0]
+        ts_little_4byte = struct.unpack('<I', timestamp_bytes[:4])[0]
+        extra_4bytes = struct.unpack('<I', timestamp_bytes[4:])[0]
+
+        print(f"  Identity Hash: {identity_hash.hex()[:32]}...")
+        print(f"  Advertising ID: {advertising_id.hex()[:32]}...")
+        print(f"  Timestamp bytes (hex): {timestamp_bytes.hex()}")
+        print(f"    Byte breakdown: {' '.join(f'{b:02x}' for b in timestamp_bytes)}")
+        print(f"    Low 4 bytes:  0x{timestamp_bytes[:4].hex()} = {ts_little_4byte}")
+        print(f"    High 4 bytes: 0x{timestamp_bytes[4:].hex()} = {extra_4bytes} (0x{extra_4bytes:08x})")
+        print(f"  Full 8-byte timestamp: {ts_little_8byte}")
+
+        # Check if sentinel
+        is_null = identity_hash == b'\x00' * 32
+        is_ones = identity_hash == b'\xff' * 32
+        if is_null or is_ones:
+            print(f"  Type: SENTINEL ({'null' if is_null else 'ones'})")
+            try:
+                dt = datetime.fromtimestamp(ts_little_8byte)
+            except bad_timestamp:
+                print("  Invalid timestamp!")
+            else:
+                print(f"  Valid timestamp: {dt.strftime('%Y-%m-%d %H:%M:%S')}")
+        else:
+            print("  Type: DATA")
+            # For data entries, check if the low 4 bytes alone make sense
+            try:
+                dt = datetime.fromtimestamp(ts_little_4byte)
+            except bad_timestamp:
+                # Report instead of silently skipping the interpretation.
+                print("  Low 4 bytes are not a representable timestamp")
+            else:
+                print(f"  If 4-byte timestamp: {dt.strftime('%Y-%m-%d %H:%M:%S')}")
+                print(f"  Extra data in high bytes: 0x{extra_4bytes:08x}")
+
+        print()
+
+    # Only mention omitted entries when there really are some.
+    remaining = total_entries - shown_entries
+    if remaining > 0:
+        print(f"... ({remaining} more entries)")
+
+if __name__ == '__main__':
+    # Exactly one command-line argument is accepted: the delta file path.
+    cli_args = sys.argv[1:]
+    if len(cli_args) != 1:
+        print("Usage: python3 debug_delta_file.py <delta-file.dat>")
+        sys.exit(1)
+
+    delta_path = cli_args[0]
+    analyze_delta_file(delta_path)
+
@@ -0,0 +1,6 @@
+def main():
+    """Print the project's greeting line to standard output."""
+    greeting = "Hello from uid2-optout!"
+    print(greeting)
+
+
+# Run the greeting only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,3 @@
+{
+    "manual_override": ""
+}
@@ -0,0 +1,9 @@
+[project]
+name = "uid2-optout"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+    "boto3>=1.40.68",
+]
-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +{
 +    "java.compile.nullAnalysis.mode": "automatic",
 +    "java.configuration.updateBuildConfiguration": "interactive"
 +}