DataFrame: Add Attribute Columns

ax3l · ax3l · commit b6c49620ba33 · 2025-12-03T17:20:16.000-08:00
Optionally add particle species attributes
as extra columns. This is super useful when dealing with
openPMD extensions or custom attributes, e.g., for accelerator
physics. In the latter case, we store reference particle
information on the particle species group (which changes per
iteration/snapshot).
diff --git a/docs/source/analysis/pandas.rst b/docs/source/analysis/pandas.rst
@@ -50,6 +50,17 @@ One can also combine all iterations in a single dataframe like this:
    # like before but with a new column "iteration" and all particles
    print(df)
 
+Additionally, one can add openPMD particle species attributes, e.g.,
+from the `ED-PIC <https://github.com/openPMD/openPMD-standard/blob/1.1.0/EXT_ED-PIC.md#particle-records-macroparticles>`__ extension
+or `custom code properties <https://impactx.readthedocs.io/en/25.11/dataanalysis/dataanalysis.html#additional-beam-attributes>`__
+as extra dataframe columns:
+
+.. code-block:: python
+
+   df = s.to_df("electrons", attributes=["s_ref"])
+
+   # like before but with a new column "s_ref"
+   print(df)
 
 .. _analysis-pandas-ascii:
 
diff --git a/examples/11_particle_dataframe.py b/examples/11_particle_dataframe.py
@@ -38,26 +38,27 @@
     s = io.Series("../samples/git-sample/data%T.h5", io.Access.read_only)
     electrons = s.snapshots()[400].particles["electrons"]
 
-    # all particles
-    df = electrons.to_df()
+    # all particles, extra column for "particleShape" attribute
+    #                (from ED-PIC extension)
+    df = electrons.to_df(attributes=["particleShape"])
     print(type(df) is pd.DataFrame)
     print(df)
 
     # only first 100 particles
-    df = electrons.to_df(np.s_[:100])
+    df = electrons.to_df(slice=np.s_[:100])
     print(df)
 
     # all particles over all steps
-    df = s.to_df("electrons")
+    df = s.to_df("electrons", attributes=["particleShape"])
     print(df)
 
     if found_cudf:
         # all particles - to GPU
-        cdf = cudf.from_pandas(electrons.to_df())
+        cdf = cudf.from_pandas(electrons.to_df(attributes=["particleShape"]))
         print(cdf)
 
         # all particles over all steps - to GPU
-        cdf = s.to_cudf("electrons")
+        cdf = s.to_cudf("electrons", attributes=["particleShape"])
         print(cdf)
 
     # Particles
@@ -67,7 +68,7 @@
         # pickle capabilities, so we test this here:
         dask.config.set(scheduler='processes')
 
-        df = electrons.to_dask()
+        df = electrons.to_dask(attributes=["particleShape"])
         print(df)
 
         # check chunking of a variable
diff --git a/src/binding/python/openpmd_api/DaskDataFrame.py b/src/binding/python/openpmd_api/DaskDataFrame.py
@@ -8,19 +8,22 @@
 import numpy as np
 
 
-def read_chunk_to_df(species, chunk):
+def read_chunk_to_df(species, chunk, attributes=None):
     stride = np.s_[chunk.offset[0]:chunk.offset[0]+chunk.extent[0]]
-    return species.to_df(stride)
+    return species.to_df(attributes=attributes, slice=stride)
 
 
-def particles_to_daskdataframe(particle_species):
+def particles_to_daskdataframe(particle_species, attributes=None):
     """
     Load all records of a particle species into a Dask DataFrame.
 
     Parameters
     ----------
     particle_species : openpmd_api.ParticleSpecies
         A ParticleSpecies class in openPMD-api.
+    attributes : list of strings, optional
+        A list of attributes of the particle_species that should be read and
+        added as extra columns.
 
     Returns
     -------
@@ -83,7 +86,9 @@ def particles_to_daskdataframe(particle_species):
 
     # merge DataFrames
     dfs = [
-        delayed(read_chunk_to_df)(particle_species, chunk) for chunk in chunks
+        delayed(read_chunk_to_df)(
+            particle_species, chunk=chunk, attributes=attributes
+        ) for chunk in chunks
     ]
     df = dd.from_delayed(dfs)
 
diff --git a/src/binding/python/openpmd_api/DataFrame.py b/src/binding/python/openpmd_api/DataFrame.py
@@ -10,14 +10,17 @@
 import numpy as np
 
 
-def particles_to_dataframe(particle_species, slice=None):
+def particles_to_dataframe(particle_species, attributes=None, slice=None):
     """
     Load all records of a particle species into a Pandas DataFrame.
 
     Parameters
     ----------
     particle_species : openpmd_api.ParticleSpecies
         A ParticleSpecies class in openPMD-api.
+    attributes : list of strings, optional
+        A list of attributes of the particle_species that should be read and
+        added as extra columns.
     slice : np.s_, optional
         A numpy slice that can be used to load only a sub-selection of
         particles.
@@ -69,14 +72,18 @@ def particles_to_dataframe(particle_species, slice=None):
 
     df = pd.DataFrame(columns)
 
+    if attributes is not None:
+        for attribute in attributes:
+            df[attribute] = particle_species.get_attribute(attribute)
+
     # set a header for the first column (row index)
     #   note: this is NOT the particle id
     df.index.name = "row"
 
     return df
 
 
-def iterations_to_dataframe(series, species_name):
+def iterations_to_dataframe(series, species_name, attributes=None):
     """
     Load all iterations of a particle species into a Pandas DataFrame.
 
@@ -86,6 +93,9 @@ def iterations_to_dataframe(series, species_name):
         A Series class in openPMD-api.
     species_name : string
         The name of a particle species.
+    attributes : list of strings, optional
+        A list of attributes of the particle species that should be read and
+        added as extra columns.
 
     Returns
     -------
@@ -115,7 +125,7 @@ def iterations_to_dataframe(series, species_name):
         (
             iteration
             .particles[species_name]
-            .to_df()
+            .to_df(attributes=attributes)
             .assign(iteration=i)
             for i, iteration in series.snapshots().items()
         ),
@@ -126,7 +136,7 @@ def iterations_to_dataframe(series, species_name):
     return df
 
 
-def iterations_to_cudf(series, species_name):
+def iterations_to_cudf(series, species_name, attributes=None):
     """
     Load all iterations of a particle species into a cuDF DataFrame.
 
@@ -136,6 +146,9 @@ def iterations_to_cudf(series, species_name):
         A Series class in openPMD-api.
     species_name : string
         The name of a particle species.
+    attributes : list of strings, optional
+        A list of attributes of the particle species that should be read and
+        added as extra columns.
 
     Returns
     -------
@@ -172,7 +185,7 @@ def iterations_to_cudf(series, species_name):
             cudf.from_pandas(
                 iteration
                 .particles[species_name]
-                .to_df()
+                .to_df(attributes=attributes)
                 .assign(iteration=i)
             )
             for i, iteration in series.snapshots().items()