Skip to content

Commit 46e3689

Browse files
committed
add historical data harvester
1 parent 95e3671 commit 46e3689

File tree

15 files changed

+427
-79
lines changed

15 files changed

+427
-79
lines changed

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ jobs:
8585
- name: Generate coverage report
8686
run: pipx run coverage lcov -o coverage.lcov
8787
- name: Upload partial coverage report
88-
uses: coverallsapp/github-action@master
88+
uses: coverallsapp/github-action@v2
8989
with:
9090
path-to-lcov: coverage.lcov
9191
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -97,7 +97,7 @@ jobs:
9797
runs-on: ubuntu-latest
9898
steps:
9999
- name: Finalize coverage report
100-
uses: coverallsapp/github-action@master
100+
uses: coverallsapp/github-action@v2
101101
with:
102102
github-token: ${{ secrets.GITHUB_TOKEN }}
103103
parallel-finished: true

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*.orig
99
*.log
1010
*.pot
11-
__pycache__/*
11+
__pycache__/
1212
.cache/*
1313
.*.swp
1414
*/.ipynb_checkpoints/*

AUTHORS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
# Contributors
22

3-
* Meir Tseitlin [meir@imubit.com](mailto:meir@imubit.com)
3+
* Meir Tseitlin [opensource@imubit.com](mailto:opensource@imubit.com)

setup.cfg

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ package_dir =
3232
install_requires =
3333
dynaconf
3434
pandas
35+
msgpack
36+
zstandard
3537
aiomisc
3638
aiodebug
3739
apscheduler

src/data_agent/abstract_connector.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from abc import ABC, abstractmethod
2+
from enum import IntEnum
23
from functools import wraps
34
from typing import Union
45

@@ -7,7 +8,7 @@
78
from .exceptions import ConnectionNotActive, TagsGroupNotFound
89

910

10-
class SupportedOperation:
11+
class SupportedOperation(IntEnum):
1112
READ_TAG_VALUE = 1
1213
WRITE_TAG_VALUE = 2
1314
READ_TAG_PERIOD = 3
@@ -20,6 +21,12 @@ class SupportedOperation:
2021
DELETE_TAG = 10
2122

2223

24+
class HistDataFormat(IntEnum):
    """Formats in which harvested historical data can be returned/delivered."""

    # FIX: values were written as 1-tuples ``(1,)`` / ``(2,)``; IntEnum
    # happened to coerce them to ints, but the plain form is the intended,
    # less error-prone spelling and matches DICTIONARY below.
    DATAFRAME = 1
    SERIES_LIST = 2
    DICTIONARY = 3
28+
29+
2330
STANDARD_ATTRIBUTES = {
2431
"Name": {"Type": "str", "Name": "Tag Name"},
2532
"Type": {"Type": "str", "Name": "Data Type"},

src/data_agent/exceptions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,3 +76,7 @@ class SafetyErrorManipulateOutsideOfRateBound(Exception):
7676

7777
class DaqJobAlreadyExists(Exception):
7878
pass
79+
80+
81+
class HistoryHarvesterJobAlreadyExists(Exception):
    """Raised when a history-harvester job with the given id is already scheduled."""
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import datetime as dt
2+
import logging
3+
import time as tm
4+
5+
from apscheduler.executors.pool import ThreadPoolExecutor
6+
from apscheduler.schedulers.asyncio import AsyncIOScheduler
7+
8+
from data_agent.exceptions import HistoryHarvesterJobAlreadyExists
9+
from data_agent.msg_packer import encode_dataframe
10+
11+
log = logging.getLogger(__name__)
12+
13+
14+
class HistoryHarvester:
    """Delivers historical tag data to a broker in time-sliced batches.

    A delivery job reads one ``batch_size``-long slice of history from a
    connection, publishes the encoded frame to the broker, then reschedules
    itself for the next slice until ``last_timestamp`` is reached.
    """

    def __init__(self, connection_manager, broker):
        self.connection_manager = connection_manager
        self.broker = broker

        self._thread_pool_executor = ThreadPoolExecutor(max_workers=20)
        self._scheduler = AsyncIOScheduler(thread_pool=self._thread_pool_executor)
        self._scheduler.start()

    def _schedule_batch(
        self,
        job_id,
        conn,
        tags,
        first_timestamp,
        last_timestamp,
        time_frequency,
        batch_size,
        iteration,
    ):
        # Single place that submits a batch run; used both for the initial
        # job and for chaining the next batch from inside a running job
        # (previously this add_job call was duplicated verbatim).
        self._scheduler.add_job(
            func=self._delivery_job_func,
            trigger="date",
            coalesce=True,  # Always run once
            id=job_id,
            max_instances=2,
            replace_existing=True,
            args=[
                job_id,
                conn,
                tags,
                first_timestamp,
                last_timestamp,
                time_frequency,
                batch_size,
                iteration,
            ],
        )

    async def _delivery_job_func(
        self,
        job_id,
        conn,
        tags,
        first_timestamp,
        last_timestamp,
        time_frequency,
        batch_size,
        iteration,
    ):
        """Read one batch of history, publish it, and chain the next batch."""
        try:
            start_time = tm.time()

            next_period_end = min(last_timestamp, first_timestamp + batch_size)

            df = conn.read_tag_values_period(
                tags=tags,
                first_timestamp=first_timestamp,
                last_timestamp=next_period_end,
                time_frequency=time_frequency,
            )

            read_time = tm.time() - start_time

            if df.empty:
                log.warning(
                    f"No data read for job '{job_id}' for period {first_timestamp} - {next_period_end}"
                )

            else:  # Publish data
                headers = {
                    "data_category": "historical",
                    "connection": conn.name,
                    "job_id": job_id,
                    "batch_num": iteration,
                }

                payload = encode_dataframe(df)

                # FIX: removed stray unbalanced ')' after the read time.
                log.debug(
                    f"(#{iteration}): Job {job_id}: "
                    f"Data publish: read time={read_time:.2f}s, {len(df)} samples, "
                    f"period: {first_timestamp} - {next_period_end}"
                )
                self.broker.publish_data(payload, headers=headers)

            if next_period_end < last_timestamp:
                # Reschedule next run, resuming where this batch ended.
                self._schedule_batch(
                    job_id,
                    conn,
                    tags,
                    next_period_end,
                    last_timestamp,
                    time_frequency,
                    batch_size,
                    iteration + 1,
                )

        except Exception as e:
            log.exception(f'Exception in history harvester job "{job_id}" - {e}')

    def create_delivery_job(
        self,
        job_id: str,
        conn_name: str,
        tags: list,
        first_timestamp: dt.datetime,
        last_timestamp: dt.datetime,
        time_frequency: dt.timedelta,
        batch_size: dt.timedelta = None,
        progress_callback=None,
    ):
        """Create a new history-delivery job.

        :param job_id: unique scheduler id for the job.
        :param conn_name: name of the registered connection to read from.
        :param tags: tags to harvest (delivered in alphabetical order).
        :param first_timestamp: start of the requested history period.
        :param last_timestamp: end of the requested history period.
        :param time_frequency: sampling frequency passed to the connector.
        :param batch_size: period covered by a single read/publish cycle;
            defaults to the whole requested period (single batch).
        :param progress_callback: accepted for interface compatibility;
            NOTE(review): currently unused.
        :raises HistoryHarvesterJobAlreadyExists: if ``job_id`` is already
            scheduled.
        """
        # BUG FIX: batch_size defaulted to None, which made
        # `first_timestamp + batch_size` in the job raise TypeError.
        if batch_size is None:
            batch_size = last_timestamp - first_timestamp

        # Order tags alphabetically without mutating the caller's list.
        tags = sorted(tags)

        if self._scheduler.get_job(job_id):
            raise HistoryHarvesterJobAlreadyExists(
                f"History loader Job {job_id} already exists."
            )

        conn = self.connection_manager.connection(conn_name, check_enabled=False)

        self._schedule_batch(
            job_id,
            conn,
            tags,
            first_timestamp,
            last_timestamp,
            time_frequency,
            batch_size,
            0,
        )

src/data_agent/linux/config_default.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,11 @@ log: # standard logging dictConfig
160160
level: 'DEBUG'
161161
propagate: False
162162

163+
data_agent.history_harvester:
164+
handlers: ['console', 'amqp']
165+
level: 'DEBUG'
166+
propagate: False
167+
163168
data_agent.connection_manager:
164169
handlers: ['console', 'amqp']
165170
level: 'DEBUG'

src/data_agent/msg_packer.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import msgpack
2+
import numpy as np
3+
import pandas as pd
4+
import zstandard as zstd
5+
6+
7+
def encode_dataframe(df, ts_unit="s", zstd_level=10):
    """Pack a time-indexed DataFrame into a zstd-compressed msgpack blob.

    Payload layout (four msgpack items concatenated, then compressed):
      * ExtType 0 - int64 epoch timestamps expressed in ``ts_unit``
      * ExtType 1 - raw bytes of the fixed-dtype (numeric) record array
      * ExtType 2 - msgpack-encoded dict of the object-dtype columns
      * map       - metadata used by ``decode_payload`` to rebuild the frame

    :param df: frame to serialize; assumes a datetime64[ns] index -- TODO confirm
    :param ts_unit: unit for the serialized timestamps ("s", "ms", "us", "ns").
    :param zstd_level: zstandard compression level.
    :return: compressed ``bytes`` payload.
    """
    ns_per_unit = {"s": 1_000_000_000, "ms": 1_000_000, "us": 1_000, "ns": 1}

    # 1) timestamps -> int64 epoch values in the requested unit
    epoch = df.index.view("int64")
    if ts_unit != "ns":
        epoch = (epoch // ns_per_unit[ts_unit]).astype("int64")

    # 2) split object-dtype columns from fixed-dtype (numeric) ones
    obj_cols = df.select_dtypes(include=["object"]).columns.tolist()
    num_cols = [c for c in df.columns if c not in obj_cols]

    # 2a) fixed-dtype columns travel as a single record-array byte blob
    if num_cols:
        records = df[num_cols].to_records(index=False)
        num_blob = records.tobytes()
        num_descr = records.dtype.descr
    else:
        num_blob = b""
        num_descr = []

    # 2b) object-dtype columns travel as plain Python lists
    obj_data = {c: df[c].tolist() for c in obj_cols}

    # 3) metadata -- includes the original column order so decode can restore it
    meta = {
        "ts_unit": ts_unit,
        "num_descr": [list(x) for x in num_descr],
        "num_cols": num_cols,
        "obj_cols": obj_cols,
        "orig_cols": df.columns.tolist(),
        "index_name": df.index.name,
    }

    packer = msgpack.Packer(use_bin_type=True, strict_types=True)
    raw = b"".join(
        [
            packer.pack(msgpack.ExtType(0, epoch.tobytes())),
            packer.pack(msgpack.ExtType(1, num_blob)),
            # Ext code 2 carries the already-msgpacked object data blob:
            packer.pack(msgpack.ExtType(2, msgpack.packb(obj_data, use_bin_type=True))),
            packer.pack(meta),
        ]
    )
    return zstd.ZstdCompressor(level=zstd_level).compress(raw)
53+
54+
55+
def decode_payload(blob):
    """Inverse of ``encode_dataframe``: rebuild the DataFrame from a blob.

    :param blob: compressed ``bytes`` produced by ``encode_dataframe``.
    :return: DataFrame with its datetime index, dtypes, and original column
        order restored.
    """
    # 1) decompress
    raw = zstd.ZstdDecompressor().decompress(blob)

    # 2) ext_hook to pull out our three ExtTypes
    def ext_hook(code, data):
        if code == 0:
            return np.frombuffer(data, dtype="int64")  # timestamps
        if code in (1, 2):
            return data  # numeric / object blobs stay raw bytes
        return msgpack.ExtType(code, data)

    # 3) unpack in the same sequence they were packed
    unpacker = msgpack.Unpacker(ext_hook=ext_hook, raw=False)
    unpacker.feed(raw)
    ts_arr = next(unpacker)
    num_blob = next(unpacker)
    obj_blob = next(unpacker)
    meta = next(unpacker)

    # 4) rebuild timestamps (scale back to nanoseconds)
    factor = {"s": 1_000_000_000, "ms": 1_000_000, "us": 1_000, "ns": 1}[
        meta["ts_unit"]
    ]
    idx = pd.to_datetime(ts_arr * factor)
    idx.name = meta["index_name"]

    # 5) rebuild fixed-dtype DataFrame
    num_cols = meta["num_cols"]
    if num_cols:
        dtype_descr = [tuple(x) for x in meta["num_descr"]]
        rec = np.frombuffer(num_blob, dtype=np.dtype(dtype_descr))
        df_num = pd.DataFrame(rec, columns=num_cols)
    else:
        # BUG FIX: this used to be `pd.DataFrame(index=idx)`. For frames with
        # only object columns, concat(axis=1) then unioned the datetime index
        # with df_obj's RangeIndex, and the final `df.index = idx` failed with
        # a length mismatch. An empty frame lets df_obj's rows win instead.
        df_num = pd.DataFrame()

    # 6) rebuild object-dtype DataFrame
    obj_cols = meta["obj_cols"]
    if obj_cols:
        obj_data = msgpack.unpackb(obj_blob, raw=False)
        df_obj = pd.DataFrame(obj_data)
    else:
        df_obj = pd.DataFrame()

    # 7) combine, restore index, and reorder exactly as the original
    df = pd.concat([df_num, df_obj], axis=1)
    if len(df) != len(idx):
        # No data columns at all: still return a frame carrying the timestamps.
        df = pd.DataFrame(index=pd.RangeIndex(len(idx)))
    df.index = idx

    df = df[meta["orig_cols"]]
    return df

0 commit comments

Comments
 (0)