Skip to content

Commit f89be6f

Browse files
Add job metadata migration utility and optimize progress()
- Add add_job_metadata_columns() migration utility to migrate.py
  - Adds hidden columns to existing Computed/Imported tables
  - Supports single tables or entire schemas
  - Dry-run mode for previewing changes
- Optimize AutoPopulate.progress() with single aggregation query
  - Uses LEFT JOIN with COUNT(DISTINCT) for efficiency
  - Handles 1:many relationships correctly
  - Falls back to two-query method when no common attributes
- Remove target property from AutoPopulate (always uses self)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent 574b5f1 commit f89be6f

File tree

3 files changed

+227
-22
lines changed

3 files changed

+227
-22
lines changed

src/datajoint/autopopulate.py

Lines changed: 62 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def _rename_attributes(table, props):
127127
)
128128

129129
if self._key_source is None:
130-
parents = self.target.parents(primary=True, as_objects=True, foreign_key_info=True)
130+
parents = self.parents(primary=True, as_objects=True, foreign_key_info=True)
131131
if not parents:
132132
raise DataJointError("A table must have dependencies from its primary key for auto-populate to work")
133133
self._key_source = _rename_attributes(*parents[0])
@@ -204,15 +204,6 @@ def make(self, key):
204204
self.make_insert(key, *computed_result)
205205
yield
206206

207-
@property
208-
def target(self):
209-
"""
210-
:return: table to be populated.
211-
In the typical case, dj.AutoPopulate is mixed into a dj.Table class by
212-
inheritance and the target is self.
213-
"""
214-
return self
215-
216207
def _jobs_to_do(self, restrictions):
217208
"""
218209
:return: the query yielding the keys to be computed (derived from self.key_source)
@@ -235,7 +226,7 @@ def _jobs_to_do(self, restrictions):
235226
raise DataJointError(
236227
"The populate target lacks attribute %s "
237228
"from the primary key of key_source"
238-
% next(name for name in todo.heading.primary_key if name not in self.target.heading)
229+
% next(name for name in todo.heading.primary_key if name not in self.heading)
239230
)
240231
except StopIteration:
241232
pass
@@ -324,7 +315,7 @@ def _populate_direct(
324315
Computes keys directly from key_source, suitable for single-worker
325316
execution, development, and debugging.
326317
"""
327-
keys = (self._jobs_to_do(restrictions) - self.target).fetch("KEY")
318+
keys = (self._jobs_to_do(restrictions) - self).fetch("KEY")
328319

329320
logger.debug("Found %d keys to populate" % len(keys))
330321

@@ -493,14 +484,14 @@ def _populate1(self, key, jobs, suppress_errors, return_exception_objects, make_
493484
if not is_generator:
494485
self.connection.start_transaction()
495486

496-
if key in self.target: # already populated
487+
if key in self: # already populated
497488
if not is_generator:
498489
self.connection.cancel_transaction()
499490
if jobs is not None:
500491
jobs.complete(key)
501492
return False
502493

503-
logger.debug(f"Making {key} -> {self.target.full_table_name}")
494+
logger.debug(f"Making {key} -> {self.full_table_name}")
504495
self.__class__._allow_insert = True
505496

506497
try:
@@ -531,7 +522,7 @@ def _populate1(self, key, jobs, suppress_errors, return_exception_objects, make_
531522
exception=error.__class__.__name__,
532523
msg=": " + str(error) if str(error) else "",
533524
)
534-
logger.debug(f"Error making {key} -> {self.target.full_table_name} - {error_message}")
525+
logger.debug(f"Error making {key} -> {self.full_table_name} - {error_message}")
535526
if jobs is not None:
536527
jobs.error(key, error_message=error_message, error_stack=traceback.format_exc())
537528
if not suppress_errors or isinstance(error, SystemExit):
@@ -542,7 +533,7 @@ def _populate1(self, key, jobs, suppress_errors, return_exception_objects, make_
542533
else:
543534
self.connection.commit_transaction()
544535
duration = time.time() - start_time
545-
logger.debug(f"Success making {key} -> {self.target.full_table_name}")
536+
logger.debug(f"Success making {key} -> {self.full_table_name}")
546537

547538
# Update hidden job metadata if table has the columns
548539
if self._has_job_metadata_attrs():
@@ -564,11 +555,61 @@ def _populate1(self, key, jobs, suppress_errors, return_exception_objects, make_
564555
def progress(self, *restrictions, display=False):
565556
"""
566557
Report the progress of populating the table.
558+
559+
Uses a single aggregation query to efficiently compute both total and
560+
remaining counts.
561+
562+
:param restrictions: conditions to restrict key_source
563+
:param display: if True, log the progress
567564
:return: (remaining, total) -- numbers of tuples to be populated
568565
"""
569566
todo = self._jobs_to_do(restrictions)
570-
total = len(todo)
571-
remaining = len(todo - self.target)
567+
568+
# Get primary key attributes from key_source for join condition
569+
# These are the "job keys" - the granularity at which populate() works
570+
pk_attrs = todo.primary_key
571+
assert pk_attrs, "key_source must have a primary key"
572+
573+
# Find common attributes between key_source and self for the join
574+
# This handles cases where self has additional PK attributes
575+
common_attrs = [attr for attr in pk_attrs if attr in self.heading.names]
576+
577+
if not common_attrs:
578+
# No common attributes - fall back to two-query method
579+
total = len(todo)
580+
remaining = len(todo - self)
581+
else:
582+
# Build a single query that computes both total and remaining
583+
# Using LEFT JOIN with COUNT(DISTINCT) to handle 1:many relationships
584+
todo_sql = todo.make_sql()
585+
target_sql = self.make_sql()
586+
587+
# Build join condition on common attributes
588+
join_cond = " AND ".join(f"`$ks`.`{attr}` = `$tgt`.`{attr}`" for attr in common_attrs)
589+
590+
# Build DISTINCT key expression for counting unique jobs
591+
# Use CONCAT for composite keys to create a single distinct value
592+
if len(pk_attrs) == 1:
593+
distinct_key = f"`$ks`.`{pk_attrs[0]}`"
594+
null_check = f"`$tgt`.`{common_attrs[0]}`"
595+
else:
596+
distinct_key = "CONCAT_WS('|', {})".format(", ".join(f"`$ks`.`{attr}`" for attr in pk_attrs))
597+
null_check = f"`$tgt`.`{common_attrs[0]}`"
598+
599+
# Single aggregation query:
600+
# - COUNT(DISTINCT key) gives total unique jobs in key_source
601+
# - Remaining = jobs where no matching target row exists
602+
sql = f"""
603+
SELECT
604+
COUNT(DISTINCT {distinct_key}) AS total,
605+
COUNT(DISTINCT CASE WHEN {null_check} IS NULL THEN {distinct_key} END) AS remaining
606+
FROM ({todo_sql}) AS `$ks`
607+
LEFT JOIN ({target_sql}) AS `$tgt` ON {join_cond}
608+
"""
609+
610+
result = self.connection.query(sql).fetchone()
611+
total, remaining = result
612+
572613
if display:
573614
logger.info(
574615
"%-20s" % self.__class__.__name__
@@ -585,7 +626,7 @@ def progress(self, *restrictions, display=False):
585626
def _has_job_metadata_attrs(self):
586627
"""Check if table has hidden job metadata columns."""
587628
# Access _attributes directly to include hidden attributes
588-
all_attrs = self.target.heading._attributes
629+
all_attrs = self.heading._attributes
589630
return all_attrs is not None and "_job_start_time" in all_attrs
590631

591632
def _update_job_metadata(self, key, start_time, duration, version):
@@ -600,9 +641,9 @@ def _update_job_metadata(self, key, start_time, duration, version):
600641
"""
601642
from .condition import make_condition
602643

603-
pk_condition = make_condition(self.target, key, set())
644+
pk_condition = make_condition(self, key, set())
604645
self.connection.query(
605-
f"UPDATE {self.target.full_table_name} SET "
646+
f"UPDATE {self.full_table_name} SET "
606647
"`_job_start_time`=%s, `_job_duration`=%s, `_job_version`=%s "
607648
f"WHERE {pk_condition}",
608649
args=(start_time, duration, version[:64] if version else ""),

src/datajoint/migrate.py

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,3 +248,167 @@ def check_migration_status(schema: Schema) -> dict:
248248
"pending": sum(1 for c in columns if c["needs_migration"]),
249249
"columns": columns,
250250
}
251+
252+
253+
# =============================================================================
254+
# Job Metadata Migration
255+
# =============================================================================
256+
257+
# Hidden job metadata columns added by config.jobs.add_job_metadata
258+
JOB_METADATA_COLUMNS = [
259+
("_job_start_time", "datetime(3) DEFAULT NULL"),
260+
("_job_duration", "float DEFAULT NULL"),
261+
("_job_version", "varchar(64) DEFAULT ''"),
262+
]
263+
264+
265+
def _get_existing_columns(connection, database: str, table_name: str) -> set[str]:
266+
"""Get set of existing column names for a table."""
267+
result = connection.query(
268+
"""
269+
SELECT COLUMN_NAME
270+
FROM information_schema.COLUMNS
271+
WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
272+
""",
273+
args=(database, table_name),
274+
)
275+
return {row[0] for row in result.fetchall()}
276+
277+
278+
def _is_autopopulated_table(table_name: str) -> bool:
279+
"""Check if a table name indicates a Computed or Imported table."""
280+
# Computed tables start with __ (but not part tables which have __ in middle)
281+
# Imported tables start with _ (but not __)
282+
if table_name.startswith("__"):
283+
# Computed table if no __ after the prefix
284+
return "__" not in table_name[2:]
285+
elif table_name.startswith("_"):
286+
# Imported table
287+
return True
288+
return False
289+
290+
291+
def add_job_metadata_columns(target, dry_run: bool = True) -> dict:
292+
"""
293+
Add hidden job metadata columns to existing Computed/Imported tables.
294+
295+
This migration utility adds the hidden columns (_job_start_time, _job_duration,
296+
_job_version) to tables that were created before config.jobs.add_job_metadata
297+
was enabled.
298+
299+
Args:
300+
target: Either a table class/instance (dj.Computed or dj.Imported) or
301+
a Schema object. If a Schema, all Computed/Imported tables in
302+
the schema will be processed.
303+
dry_run: If True (default), only preview changes without applying.
304+
305+
Returns:
306+
Dict with keys:
307+
- tables_analyzed: Number of tables checked
308+
- tables_modified: Number of tables that were/would be modified
309+
- columns_added: Total columns added across all tables
310+
- details: List of dicts with per-table information
311+
312+
Example:
313+
>>> import datajoint as dj
314+
>>> from datajoint.migrate import add_job_metadata_columns
315+
>>>
316+
>>> # Preview migration for a single table
317+
>>> result = add_job_metadata_columns(MyComputedTable, dry_run=True)
318+
>>> print(f"Would add {result['columns_added']} columns")
319+
>>>
320+
>>> # Apply migration to all tables in a schema
321+
>>> result = add_job_metadata_columns(schema, dry_run=False)
322+
>>> print(f"Modified {result['tables_modified']} tables")
323+
324+
Note:
325+
- Only Computed and Imported tables are modified (not Manual, Lookup, or Part tables)
326+
- Existing rows will have NULL values for _job_start_time and _job_duration
327+
- Future populate() calls will fill in metadata for new rows
328+
- This does NOT retroactively populate metadata for existing rows
329+
"""
330+
from .schemas import Schema
331+
from .table import Table
332+
333+
result = {
334+
"tables_analyzed": 0,
335+
"tables_modified": 0,
336+
"columns_added": 0,
337+
"details": [],
338+
}
339+
340+
# Determine tables to process
341+
if isinstance(target, Schema):
342+
schema = target
343+
# Get all user tables in the schema
344+
tables_query = """
345+
SELECT TABLE_NAME
346+
FROM information_schema.TABLES
347+
WHERE TABLE_SCHEMA = %s
348+
AND TABLE_TYPE = 'BASE TABLE'
349+
AND TABLE_NAME NOT LIKE '~%%'
350+
"""
351+
table_names = [row[0] for row in schema.connection.query(tables_query, args=(schema.database,)).fetchall()]
352+
tables_to_process = [
353+
(schema.database, name, schema.connection) for name in table_names if _is_autopopulated_table(name)
354+
]
355+
elif isinstance(target, type) and issubclass(target, Table):
356+
# Table class
357+
instance = target()
358+
tables_to_process = [(instance.database, instance.table_name, instance.connection)]
359+
elif isinstance(target, Table):
360+
# Table instance
361+
tables_to_process = [(target.database, target.table_name, target.connection)]
362+
else:
363+
raise DataJointError(f"target must be a Table class, Table instance, or Schema, got {type(target)}")
364+
365+
for database, table_name, connection in tables_to_process:
366+
result["tables_analyzed"] += 1
367+
368+
# Skip non-autopopulated tables
369+
if not _is_autopopulated_table(table_name):
370+
continue
371+
372+
# Check which columns need to be added
373+
existing_columns = _get_existing_columns(connection, database, table_name)
374+
columns_to_add = [(name, definition) for name, definition in JOB_METADATA_COLUMNS if name not in existing_columns]
375+
376+
if not columns_to_add:
377+
result["details"].append(
378+
{
379+
"table": f"{database}.{table_name}",
380+
"status": "already_migrated",
381+
"columns_added": 0,
382+
}
383+
)
384+
continue
385+
386+
# Generate and optionally execute ALTER statements
387+
table_detail = {
388+
"table": f"{database}.{table_name}",
389+
"status": "migrated" if not dry_run else "pending",
390+
"columns_added": len(columns_to_add),
391+
"sql_statements": [],
392+
}
393+
394+
for col_name, col_definition in columns_to_add:
395+
sql = f"ALTER TABLE `{database}`.`{table_name}` ADD COLUMN `{col_name}` {col_definition}"
396+
table_detail["sql_statements"].append(sql)
397+
398+
if not dry_run:
399+
try:
400+
connection.query(sql)
401+
logger.info(f"Added column {col_name} to {database}.{table_name}")
402+
except Exception as e:
403+
logger.error(f"Failed to add column {col_name} to {database}.{table_name}: {e}")
404+
table_detail["status"] = "error"
405+
table_detail["error"] = str(e)
406+
raise DataJointError(f"Migration failed: {e}") from e
407+
else:
408+
logger.info(f"Would add column {col_name} to {database}.{table_name}")
409+
410+
result["tables_modified"] += 1
411+
result["columns_added"] += len(columns_to_add)
412+
result["details"].append(table_detail)
413+
414+
return result

src/datajoint/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# version bump auto managed by Github Actions:
22
# label_prs.yaml(prep), release.yaml(bump), post_release.yaml(edit)
33
# manually set this version will be eventually overwritten by the above actions
4-
__version__ = "2.0.0a11"
4+
__version__ = "2.0.0a12"

0 commit comments

Comments (0)