@@ -98,12 +98,14 @@ def from_dict(cls, data: dict) -> "AuditCriteria":
9898 {
9999 "image_path" : rf .get ("image_path" ) or rf .get ("imagePath" , "" ),
100100 "field_path" : rf .get ("field_path" ) or rf .get ("fieldPath" , "" ),
101+ "hq_url_path" : rf .get ("hq_url_path" ) or rf .get ("hqUrlPath" , "" ),
101102 "label" : rf .get ("label" , "" ),
102103 "filter_by_image" : rf .get ("filter_by_image" ) or rf .get ("filterByImage" , False ),
103104 "filter_by_field" : rf .get ("filter_by_field" ) or rf .get ("filterByField" , False ),
104105 }
105106 for rf in related_fields_raw
106- if (rf .get ("image_path" ) or rf .get ("imagePath" )) and (rf .get ("field_path" ) or rf .get ("fieldPath" ))
107+ # Require image_path; field_path is optional (image-only filter rules are valid)
108+ if rf .get ("image_path" ) or rf .get ("imagePath" )
107109 ]
108110
109111 return cls (
@@ -175,10 +177,17 @@ def filter_visits_for_audit(
175177 if criteria .selected_flw_user_ids and "username" in df .columns :
176178 df = df [df ["username" ].isin (criteria .selected_flw_user_ids )]
177179
178- # Apply sample percentage
180+ # Apply sample percentage — sample the same percentage from each FLW (at least one visit per FLW), then shuffle
179181 if criteria .sample_percentage < 100 and len (df ) > 0 :
180- sample_size = max (1 , int (len (df ) * criteria .sample_percentage / 100 ))
181- df = df .sample (n = min (sample_size , len (df )), random_state = 42 )
182+ if "username" in df .columns :
183+ groups = []
184+ for _ , grp in df .groupby ("username" , dropna = False ):
185+ n = max (1 , int (len (grp ) * criteria .sample_percentage / 100 ))
186+ groups .append (grp .sample (n = min (n , len (grp )), random_state = 42 ))
187+ df = pd .concat (groups ).sample (frac = 1 , random_state = 42 )
188+ else :
189+ sample_size = max (1 , int (len (df ) * criteria .sample_percentage / 100 ))
190+ df = df .sample (n = min (sample_size , len (df )), random_state = 42 )
182191
183192 if return_visits :
184193 return df .to_dict ("records" )
@@ -737,6 +746,89 @@ def extract_images_for_visits(
737746 if str (vid ) not in result :
738747 result [str (vid )] = []
739748
749+ # Build visit lookup once — shared by enrichment and fallback sections below
750+ visit_dict_by_id = {str (v .get ("id" , "" )): v for v in visit_dicts }
751+
752+ # Fetch cc_domain for building CommCareHQ attachment URLs (cached, ~1 API call per hour)
753+ cc_domain = None
754+ try :
755+ from commcare_connect .workflow .templates .mbw_monitoring .data_fetchers import fetch_opportunity_metadata
756+
757+ meta = fetch_opportunity_metadata (self .access_token , opp_id )
758+ cc_domain = meta .get ("cc_domain" )
759+ except Exception as e :
760+ # Intentionally broad: cc_domain is optional for URL construction; any failure
761+ # (network, missing key, unexpected format) should degrade gracefully, not block audit.
762+ logger .debug (f"[ImageExtract] Could not fetch cc_domain for hq_url construction: { e } " )
763+
764+ # Enrich Connect blob images with xform_id and build hq_url
765+ hq_base = settings .COMMCARE_HQ_URL .rstrip ("/" )
766+ for visit_id_str , images in result .items ():
767+ visit_data = visit_dict_by_id .get (visit_id_str , {})
768+ form_json = visit_data .get ("form_json" , {})
769+ xform_id = form_json .get ("id" ) or ""
770+ for img in images :
771+ img ["xform_id" ] = xform_id
772+ if cc_domain and xform_id and img .get ("name" ) and not img .get ("hq_url" ):
773+ img ["hq_url" ] = f"{ hq_base } /a/{ cc_domain } /api/form/attachment/{ xform_id } /{ img ['name' ]} "
774+
775+ # Fallback: for every visit, add CommCareHQ URL images extracted from form_json for any
776+ # related_fields image rule whose image type is not already present (e.g. no Connect blob for it).
777+ # Strategy 1: use hq_url_path (pre-computed URL stored in form JSON)
778+ # Strategy 2: extract filename from image_path, build HQ attachment URL
779+ # (used when hq_url_path is empty — e.g. dynamic image type discovery
780+ # can't resolve DataBindOnly XForm paths from the HQ app definition API)
781+ if related_fields :
782+ import hashlib
783+
784+ image_rules = [r for r in related_fields if r .get ("image_path" )]
785+ if image_rules :
786+ for visit_id_str , images in result .items ():
787+ visit_data = visit_dict_by_id .get (visit_id_str , {})
788+ form_json = visit_data .get ("form_json" , {})
789+ form_data = form_json .get ("form" , form_json )
790+ xform_id = form_json .get ("id" ) or ""
791+ username = visit_data .get ("username" ) or ""
792+ # Use form.meta.timeEnd for actual submission time; fall back to visit_date (date only)
793+ visit_date = form_data .get ("meta" , {}).get ("timeEnd" ) or visit_data .get ("visit_date" ) or ""
794+ entity_name = visit_data .get ("entity_name" ) or "No Entity"
795+ for rule in image_rules :
796+ hq_url_path = rule .get ("hq_url_path" , "" )
797+ image_path = rule .get ("image_path" , "" )
798+
799+ # Skip if this image type is already present (e.g. from Connect blob)
800+ if any (img .get ("question_id" ) == image_path for img in images ):
801+ continue
802+
803+ # Strategy 1: pre-computed URL field in form JSON
804+ hq_url = None
805+ if hq_url_path :
806+ extracted = self ._extract_field_value (form_data , hq_url_path )
807+ if extracted and isinstance (extracted , str ) and extracted .startswith ("http" ):
808+ hq_url = extracted
809+
810+ # Strategy 2: build URL from filename stored at image_path
811+ if not hq_url and cc_domain and xform_id and image_path :
812+ filename = self ._extract_field_value (form_data , image_path )
813+ if filename and isinstance (filename , str ) and not filename .startswith ("http" ):
814+ hq_url = f"{ hq_base } /a/{ cc_domain } /api/form/attachment/{ xform_id } /{ filename } "
815+
816+ if hq_url :
817+ blob_id = "hq_" + hashlib .sha256 (hq_url .encode ()).hexdigest ()[:16 ]
818+ name = hq_url_path .split ("/" )[- 1 ] if hq_url_path else image_path .split ("/" )[- 1 ]
819+ images .append (
820+ {
821+ "blob_id" : blob_id ,
822+ "hq_url" : hq_url ,
823+ "xform_id" : xform_id ,
824+ "name" : name ,
825+ "question_id" : image_path ,
826+ "username" : username ,
827+ "visit_date" : visit_date ,
828+ "entity_name" : entity_name ,
829+ }
830+ )
831+
740832 # Add related field values if rules provided
741833 if related_fields :
742834 if progress_callback :
@@ -773,25 +865,23 @@ def _filter_visits_by_related_fields(
773865 if not filter_rules :
774866 return visit_images
775867
868+ image_filter_paths = [r .get ("image_path" , "" ) for r in filter_rules if r .get ("filter_by_image" )]
869+ field_filter_rules = [r for r in filter_rules if r .get ("filter_by_field" )]
870+
776871 filtered_result = {}
777872 for visit_id , images in visit_images .items ():
778873 include_visit = True
779874
780- for rule in filter_rules :
781- image_path = rule . get ( "image_path" , "" )
782- field_path = rule .get ("field_path" , "" )
783- filter_by_image = rule . get ( "filter_by_image" , False )
784- filter_by_field = rule . get ( "filter_by_field" , False )
875+ # OR logic: include visit if it has ANY of the required image types
876+ if image_filter_paths :
877+ question_ids = { img .get ("question_id" ) for img in images }
878+ if not any ( p in question_ids for p in image_filter_paths ):
879+ include_visit = False
785880
786- # Check if this visit has the required image
787- if filter_by_image :
788- has_matching_image = any (img .get ("question_id" ) == image_path for img in images )
789- if not has_matching_image :
790- include_visit = False
791- break
792-
793- # Check if this visit has the required field value
794- if filter_by_field :
881+ # AND logic: visit must satisfy every field filter rule
882+ if include_visit :
883+ for rule in field_filter_rules :
884+ field_path = rule .get ("field_path" , "" )
795885 has_field_value = False
796886 for img in images :
797887 for rf in img .get ("related_fields" , []):
@@ -1136,7 +1226,7 @@ def create_audit_creation_job(
11361226 opportunities : list [dict ],
11371227 ) -> dict :
11381228 """Create an audit creation job record for tracking async creation."""
1139- from datetime import datetime
1229+ from datetime import datetime , timezone
11401230
11411231 data = {
11421232 "task_id" : task_id ,
@@ -1154,8 +1244,8 @@ def create_audit_creation_job(
11541244 },
11551245 "result" : None ,
11561246 "error" : None ,
1157- "created_at" : datetime .now ().isoformat (),
1158- "updated_at" : datetime .now ().isoformat (),
1247+ "created_at" : datetime .now (timezone . utc ).isoformat (),
1248+ "updated_at" : datetime .now (timezone . utc ).isoformat (),
11591249 }
11601250
11611251 record = self .labs_api .create_record (
@@ -1237,7 +1327,7 @@ def update_audit_creation_job(
12371327 error : str | None = None ,
12381328 ) -> dict | None :
12391329 """Update an audit creation job record."""
1240- from datetime import datetime
1330+ from datetime import datetime , timezone
12411331
12421332 from commcare_connect .labs .models import LocalLabsRecord
12431333
@@ -1267,7 +1357,7 @@ def update_audit_creation_job(
12671357 data ["result" ] = result
12681358 if error is not None :
12691359 data ["error" ] = error
1270- data ["updated_at" ] = datetime .now ().isoformat ()
1360+ data ["updated_at" ] = datetime .now (timezone . utc ).isoformat ()
12711361
12721362 # Save
12731363 updated = self .labs_api .update_record (
0 commit comments