27 commits
4052edd
added generateFileMetaData
EricBenschneider Nov 28, 2024
b66530e
added tests for meta data generation
EricBenschneider Nov 29, 2024
32fd597
updated read kernel and readMetaData for meta data generation
EricBenschneider Jan 11, 2025
2452ad8
used matrix/frame flag for meta data generation
EricBenschneider Feb 4, 2025
868aa65
updated DaphneDSL to use label flag
EricBenschneider Feb 11, 2025
9b40756
added default labels for generated meta data
EricBenschneider Feb 12, 2025
9fb400f
improved generateMetaDataTest
EricBenschneider Feb 12, 2025
003df11
added systest for metadata generation
EricBenschneider Feb 17, 2025
94293ac
Revert "updated DaphneDSL to use label flag"
EricBenschneider Feb 12, 2025
8e59b66
removed labels flag
EricBenschneider Feb 17, 2025
9861fab
improved generateMetaDataTest
EricBenschneider Feb 12, 2025
0159931
preparing for hybrid csv read without fmd
EricBenschneider Feb 12, 2025
363012b
added sample rows for meta data generation
EricBenschneider Feb 13, 2025
99ae372
moved generation back to meta data parser
EricBenschneider Feb 13, 2025
008be9e
refactor generateMetaData
EricBenschneider Feb 13, 2025
80858d8
used singlevaluetype correctly
EricBenschneider Feb 17, 2025
f517cd7
added handling of empty daphne context
EricBenschneider Feb 13, 2025
206534d
updated generateMetadata test
EricBenschneider Feb 13, 2025
2fa5c3c
moved isMatrix flag
EricBenschneider Feb 13, 2025
09d531d
fixed single value type in test
EricBenschneider Feb 13, 2025
fb85dfa
added multi line support
EricBenschneider Feb 13, 2025
126d4bf
code cleanup
EricBenschneider Feb 24, 2025
d5d4d34
tested new edge cases
EricBenschneider Feb 24, 2025
a12f201
added experiment script
EricBenschneider Feb 24, 2025
31d52c0
fix for multi line strings
EricBenschneider Feb 24, 2025
fdf4623
added evalutation results
EricBenschneider Feb 24, 2025
4ab7b59
added documentation
EricBenschneider Feb 24, 2025
13 changes: 5 additions & 8 deletions daphne-opt/daphne-opt.cpp
@@ -36,17 +36,14 @@ int main(int argc, char **argv) {
mlir::daphne::registerDaphnePasses();

mlir::DialectRegistry registry;
-    registry.insert<mlir::daphne::DaphneDialect, mlir::arith::ArithDialect,
-                    mlir::func::FuncDialect, mlir::scf::SCFDialect,
-                    mlir::LLVM::LLVMDialect, mlir::AffineDialect,
-                    mlir::memref::MemRefDialect, mlir::linalg::LinalgDialect,
-                    mlir::math::MathDialect>();
+    registry.insert<mlir::daphne::DaphneDialect, mlir::arith::ArithDialect, mlir::func::FuncDialect,
+                    mlir::scf::SCFDialect, mlir::LLVM::LLVMDialect, mlir::AffineDialect, mlir::memref::MemRefDialect,
+                    mlir::linalg::LinalgDialect, mlir::math::MathDialect>();
// Add the following to include *all* MLIR Core dialects, or selectively
// include what you need like above. You only need to register dialects that
// will be *parsed* by the tool, not the one generated
// registerAllDialects(registry);

-    return mlir::asMainReturnCode(mlir::MlirOptMain(
-        argc, argv, "Standalone DAPHNE optimizing compiler driver\n",
-        registry));
+    return mlir::asMainReturnCode(
+        mlir::MlirOptMain(argc, argv, "Standalone DAPHNE optimizing compiler driver\n", registry));
}
12 changes: 10 additions & 2 deletions doc/FileMetaDataFormat.md
@@ -18,15 +18,23 @@ limitations under the License.

Reading and writing (meta) data in Daphne.

-When loading data with ``read()`` in a DaphneDSL script, the system expects a file with the same file name in the same
+When loading non-CSV data with ``read()`` in a DaphneDSL script, the system expects a file with the same file name in the same
directory as the data file with an additional extension ``.meta``. This file contains a description of meta data stored
in JSON format.

There are two slightly varying ways of specifying meta data depending on whether there is a schema for the columns (e.g.,
a data frame - the corresponding C++ type is the Frame class) or not (this data can currently (as of version 0.1) be
loaded as `DenseMatrix<VT>` or `CSRMatrix<VT>` where `VT` is the value type template parameter).
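
To illustrate the difference, a minimal ``.meta`` file for a matrix (which has no per-column schema) could look as follows. This is a sketch: `numRows`, `numCols`, and the value-type codes are taken from this document, while the use of a single top-level `valueType` for matrices is an assumption.

```json
{
    "numRows": 2,
    "numCols": 3,
    "valueType": "f64"
}
```

For a frame, the single `valueType` is replaced by a `schema` array with one `label`/`valueType` entry per column, as in the ``.meta`` files added by this pull request.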

If data is written from a DaphneDSL script via ``write()``, the meta data file will be written to the corresponding ``filename.meta``.

## Generation of meta data

When the source file is in CSV format and no meta data file is provided, the meta data can be inferred from the data itself.
Be aware that the initial read time can more than double compared to providing a meta data file.
To mitigate this cost, meta data generation is based on sampling rows from the data file, which can lead to incomplete meta data.
The number of sampled rows can be changed in the user configuration via `numberOfSampleRows`.
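
For example, a user configuration that increases the sample size could contain the following entry. This is a minimal sketch: only the `numberOfSampleRows` key is taken from this document, and its placement as a top-level entry of the user configuration file is an assumption.

```json
{
    "numberOfSampleRows": 200
}
```

With such a configuration, a ``read()`` of a CSV file that has no neighboring ``.meta`` file samples 200 rows to infer the meta data described above.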

## Currently supported JSON fields

Expand Down
131 changes: 131 additions & 0 deletions evaluation/build-charts.py
@@ -0,0 +1,131 @@
import glob
import re
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Folder where logs are stored.
results_dir = './results'

# Ensure the output folder for the charts exists before any savefig call.
os.makedirs('fig', exist_ok=True)

# This function extracts dimensions (number of rows and columns) from the filename,
# e.g. "frame_100000r_20c_MIXED.csv" -> (100000, 20).
def extract_dims(filename):
    m = re.search(r'(\d+)r_(\d+)c', filename)
    if m:
        rows = int(m.group(1))
        cols = int(m.group(2))
        return rows, cols
    else:
        return None, None

# This function extracts the overall data type from the filename.
# It combines the main type (matrix if the filename starts with "matrix_",
# otherwise frame) with a subtype (mixed, str, float, etc.).
def extract_data_type(filename):
    base = os.path.basename(filename)
    main_type = "matrix" if base.startswith("matrix_") else "frame"
    m = re.search(r'(mixed|str|float|rep|strdiff|fixedstr|number)', base, re.IGNORECASE)
    subtype = m.group(1).lower() if m else "unknown"
    # Map fixedstr and strdiff to "str" for comparison purposes.
    if subtype in ["fixedstr", "strdiff"]:
        subtype = "str"
    return f"{main_type}_{subtype}"

# Load the CSV logs for one experiment. Files are expected to be named like
# evaluation_results_*_{experiment}.csv in the results folder.
def load_log(experiment):
    files = glob.glob(os.path.join(results_dir, f"evaluation_results_*_{experiment}.csv"))
    dfs = []
    for f in files:
        # The CSV already has a header, e.g.:
        # CSVFile,Experiment,Trial,ReadTime,GenerateTime,MetaDataReadTime,StartupSeconds,ParsingSeconds,CompilationSeconds,ExecutionSeconds,TotalSeconds
        df = pd.read_csv(f)
        # Extract dimensions and add them as columns.
        dims = df['CSVFile'].apply(extract_dims)
        df['Rows'] = dims.apply(lambda x: x[0] if x[0] is not None else np.nan)
        df['Cols'] = dims.apply(lambda x: x[1] if x[1] is not None else np.nan)
        # Compute a size measure (total number of cells).
        df['Size'] = df['Rows'] * df['Cols']
        # Extract a combined data type (main type and subtype).
        df['DataType'] = df['CSVFile'].apply(extract_data_type)
        dfs.append(df)
    if dfs:
        return pd.concat(dfs, ignore_index=True)
    else:
        return pd.DataFrame()

# Load the two experiment logs.
df_normal = load_log("normal")
df_create = load_log("create")

# Compute average timings per dataset (grouped by CSVFile, Size, Rows, Cols, and DataType).
def aggregate_log(df):
    # Convert timing fields to numeric type.
    cols_to_numeric = ['ReadTime', 'GenerateTime',
                       'StartupSeconds', 'ParsingSeconds', 'CompilationSeconds',
                       'ExecutionSeconds', 'TotalSeconds']
    for col in cols_to_numeric:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    # Group including DataType so that it is preserved in the aggregation.
    return df.groupby(['CSVFile', 'Size', 'Rows', 'Cols', 'DataType'])[cols_to_numeric].mean().reset_index()

agg_normal = aggregate_log(df_normal)
agg_create = aggregate_log(df_create)

# Plot 1: Overall read time comparison: normal read vs. read with generated meta data.
plt.figure(figsize=(10, 6))
agg_normal = agg_normal.sort_values("Size")
agg_create = agg_create.sort_values("Size")

plt.plot(agg_normal["Size"], agg_normal["ReadTime"], marker="o", label="Normal Read")
plt.plot(agg_create["Size"], agg_create["ReadTime"], marker="s", label="Read with Generated Meta Data (Overall)")
plt.xlabel("Dataset Size (Rows x Cols)")
plt.ylabel("Overall Read Time (seconds)")
plt.title("Overall Read Time vs Dataset Size")
plt.xscale("log")  # Logarithmic scale on the x-axis.
plt.yscale("log")  # Logarithmic scale on the y-axis.
plt.legend()
plt.grid(True, which="both", ls="--")
plt.tight_layout()
plt.savefig("fig/overall_read_time.png")
plt.close()

# Plot 2: Read comparison per dataset size for each data type.
unique_types = agg_normal["DataType"].unique()
for dt in unique_types:
    sub_normal = agg_normal[agg_normal["DataType"] == dt].sort_values("Size")
    sub_create = agg_create[agg_create["DataType"] == dt].sort_values("Size")

    plt.figure(figsize=(10, 6))
    plt.plot(sub_normal["Size"], sub_normal["ReadTime"], marker="o", label="Normal Read")
    plt.plot(sub_create["Size"], sub_create["ReadTime"], marker="s", label="Read with Generated Meta Data (Overall)")
    plt.xlabel("Dataset Size (Rows x Cols)")
    plt.ylabel("Overall Read Time (seconds)")
    plt.title(f"Overall Read Time vs Dataset Size for {dt}")
    plt.xscale("log")  # Logarithmic scale on the x-axis.
    plt.yscale("log")  # Logarithmic scale on the y-axis.
    plt.legend()
    plt.grid(True, which="both", ls="--")
    plt.tight_layout()
    plt.savefig(f"fig/overall_read_time_{dt}.png")
    plt.close()

# Plot 3: Breakdown for the first read (create) - stacked bars of overall read
# time and meta data generation time.
if not agg_create.empty:
    ind = np.arange(len(agg_create))
    width = 0.6
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(ind, agg_create["ReadTime"], width, label="Overall Read Time")
    ax.bar(ind, agg_create["GenerateTime"], width, bottom=agg_create["ReadTime"], label="Meta Data Generation Time")
    ax.set_xticks(ind)
    ax.set_xticklabels(agg_create["CSVFile"], rotation=45, ha="right")
    ax.set_ylabel("Time (seconds)")
    ax.set_title("First Read Breakdown (Create): Read vs. Meta Data Generation")
    ax.legend()
    plt.tight_layout()
    plt.savefig("fig/create_read_breakdown.png")
    plt.close()

print("Charts generated and saved as PNG files.")
86 changes: 86 additions & 0 deletions evaluation/data/frame_100000r_20c_NUMBER.csv.meta
@@ -0,0 +1,86 @@
{
"numRows": 100000,
"numCols": 20,
"schema": [
{
"label": "col_0_uint8",
"valueType": "ui8"
},
{
"label": "col_1_int8",
"valueType": "si8"
},
{
"label": "col_2_uint32",
"valueType": "ui32"
},
{
"label": "col_3_int32",
"valueType": "si32"
},
{
"label": "col_4_uint64",
"valueType": "ui64"
},
{
"label": "col_5_int64",
"valueType": "si64"
},
{
"label": "col_6_float32",
"valueType": "f32"
},
{
"label": "col_7_float64",
"valueType": "f64"
},
{
"label": "col_8_uint8",
"valueType": "ui8"
},
{
"label": "col_9_int8",
"valueType": "si8"
},
{
"label": "col_10_uint32",
"valueType": "ui32"
},
{
"label": "col_11_int32",
"valueType": "si32"
},
{
"label": "col_12_uint64",
"valueType": "ui64"
},
{
"label": "col_13_int64",
"valueType": "si64"
},
{
"label": "col_14_float32",
"valueType": "f32"
},
{
"label": "col_15_float64",
"valueType": "f64"
},
{
"label": "col_16_uint8",
"valueType": "ui8"
},
{
"label": "col_17_int8",
"valueType": "si8"
},
{
"label": "col_18_uint32",
"valueType": "ui32"
},
{
"label": "col_19_int32",
"valueType": "si32"
}
]
}
86 changes: 86 additions & 0 deletions evaluation/data/frame_10000r_20c_NUMBER.csv.meta
@@ -0,0 +1,86 @@
{
"numRows": 10000,
"numCols": 20,
"schema": [
{
"label": "col_0_uint8",
"valueType": "ui8"
},
{
"label": "col_1_int8",
"valueType": "si8"
},
{
"label": "col_2_uint32",
"valueType": "ui32"
},
{
"label": "col_3_int32",
"valueType": "si32"
},
{
"label": "col_4_uint64",
"valueType": "ui64"
},
{
"label": "col_5_int64",
"valueType": "si64"
},
{
"label": "col_6_float32",
"valueType": "f32"
},
{
"label": "col_7_float64",
"valueType": "f64"
},
{
"label": "col_8_uint8",
"valueType": "ui8"
},
{
"label": "col_9_int8",
"valueType": "si8"
},
{
"label": "col_10_uint32",
"valueType": "ui32"
},
{
"label": "col_11_int32",
"valueType": "si32"
},
{
"label": "col_12_uint64",
"valueType": "ui64"
},
{
"label": "col_13_int64",
"valueType": "si64"
},
{
"label": "col_14_float32",
"valueType": "f32"
},
{
"label": "col_15_float64",
"valueType": "f64"
},
{
"label": "col_16_uint8",
"valueType": "ui8"
},
{
"label": "col_17_int8",
"valueType": "si8"
},
{
"label": "col_18_uint32",
"valueType": "ui32"
},
{
"label": "col_19_int32",
"valueType": "si32"
}
]
}