Skip to content

Commit 8aaa68c

Browse files
Add migration utils (#209)
1 parent 6f57fd6 commit 8aaa68c

File tree

7 files changed

+307
-6
lines changed

7 files changed

+307
-6
lines changed

.github/workflows/docker_publish.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ jobs:
3737
prefix: "v"
3838

3939
- run: echo "MAGE is at version ${{ steps.get-latest-tag.outputs.tag }}"
40-
40+
4141
- name: Get commit tag
4242
id: get-commit-tag
4343
run: |
@@ -58,7 +58,6 @@ jobs:
5858
echo "::set-output name=LATEST::fix"
5959
fi
6060
- run: echo "Additional tag for production image - ${{ steps.get-prod-tags.outputs.LATEST }}"
61-
6261
- name: Log in to Docker Hub
6362
uses: docker/login-action@v1
6463
with:

.github/workflows/test.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ on: [pull_request, workflow_dispatch]
99

1010
jobs:
1111
build:
12-
runs-on: ubuntu-latest
12+
runs-on: ubuntu-latest
1313
env:
1414
MEMGRAPH_VERSION: 2.8.0
1515
strategy:
@@ -26,7 +26,7 @@ jobs:
2626

2727
- name: Set up QEMU
2828
uses: docker/setup-qemu-action@v2
29-
29+
3030
- name: Set up Docker Buildx
3131
id: buildx
3232
uses: docker/setup-buildx-action@v2
@@ -68,7 +68,7 @@ jobs:
6868
uses: actions/setup-python@v2
6969
with:
7070
python-version: ${{ env.PY_VERSION }}
71-
71+
7272
- name: Install Python test dependencies
7373
run: |
7474
python -m pip install --upgrade pip

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ RUN apt-get update && apt-get install -y \
2626
python3-dev `mage-memgraph` \
2727
clang `mage-memgraph` \
2828
git `mage-memgraph` \
29+
unixodbc `mage-memgraph` \
2930
libboost-all-dev `mage-memgraph` \
3031
--no-install-recommends \
3132
# Download and install Memgraph

Dockerfile.release

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ RUN apt-get update && apt-get install -y \
2424
python3-dev `mage-memgraph` \
2525
clang `mage-memgraph` \
2626
git `mage-memgraph` \
27+
unixodbc-dev `mage-memgraph` \
2728
--no-install-recommends \
2829
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
2930

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,8 @@ To learn more about development with MAGE and Docker, visit the
197197
- python3-pip
198198
- python3-setuptools
199199
- python3-dev
200-
- clang
200+
- clang
201+
- unixodbc
201202

202203
Since Memgraph needs to load MAGE's modules, there is the `setup` script to help you. With it, you can build the modules so that Memgraph
203204
can load them on start up.

python/migrate.py

Lines changed: 296 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,296 @@
1+
import json
2+
import mgp
3+
import mysql.connector as mysql_connector
4+
import oracledb
5+
import pyodbc
6+
import threading
7+
8+
from typing import Any, Dict
9+
10+
11+
class Constants:
    """Names and sizes shared by the per-thread migration state dictionaries."""

    # How many rows each batch of the stream fetches from the source DB.
    BATCH_SIZE = 1000

    # Keys of the per-thread state dictionaries.
    CONNECTION = "connection"
    CURSOR = "cursor"
    COLUMN_NAMES = "column_names"

    # Position of the column name inside a DB-API `cursor.description` entry.
    I_COLUMN_NAME = 0
17+
18+
19+
##### MYSQL
20+
21+
# Per-thread MySQL connection state, keyed by the OS thread id so parallel
# batched executions do not share a cursor.
mysql_dict = {}


def init_migrate_mysql(
    table_or_sql: str,
    config: mgp.Map,
    config_path: str = "",
    params: mgp.Nullable[mgp.Any] = None,
):
    """Open a MySQL connection/cursor for the calling thread and execute the query.

    :param table_or_sql: Table name or an SQL query
    :param config: Connection configuration parameters (as in mysql.connector.connect)
    :param config_path: Path to a JSON file whose key/value pairs overwrite `config`
    :param params: Optionally, queries may be parameterized; `params` provides the values
    """
    global mysql_dict

    if params:
        _check_params_type(params)
    if len(config_path) > 0:
        config = _combine_config(config=config, config_path=config_path)

    if _query_is_table(table_or_sql):
        # NOTE(security): the table name is interpolated directly into the SQL
        # text — callers must not pass untrusted input here.
        table_or_sql = f"SELECT * FROM {table_or_sql};"

    # BUG FIX: the original keyed these dicts on `threading.get_native_id`
    # (the function object) instead of calling it, so every thread shared a
    # single entry. Key on the actual integer thread id.
    thread_id = threading.get_native_id()
    if thread_id not in mysql_dict:
        mysql_dict[thread_id] = {}

    if Constants.CURSOR not in mysql_dict[thread_id]:
        mysql_dict[thread_id][Constants.CURSOR] = None

    if mysql_dict[thread_id][Constants.CURSOR] is None:
        connection = mysql_connector.connect(**config)
        cursor = connection.cursor()
        cursor.execute(table_or_sql, params=params)

        mysql_dict[thread_id][Constants.CONNECTION] = connection
        mysql_dict[thread_id][Constants.CURSOR] = cursor
        mysql_dict[thread_id][Constants.COLUMN_NAMES] = [
            column[Constants.I_COLUMN_NAME] for column in cursor.description
        ]


def mysql(
    table_or_sql: str,
    config: mgp.Map,
    config_path: str = "",
    params: mgp.Nullable[mgp.Any] = None,
) -> mgp.Record(row=mgp.Map):
    """
    With migrate.mysql you can access MySQL and execute queries. The result table is converted into a stream,
    and returned rows can be used to create or merge graph structures. Config must be at least an empty map.
    If config_path is passed, every key,value pair from the JSON file will overwrite any values in config.

    :param table_or_sql: Table name or an SQL query
    :param config: Connection configuration parameters (as in mysql.connector.connect)
    :param config_path: Path to the JSON file containing configuration parameters (as in mysql.connector.connect)
    :param params: Optionally, queries may be parameterized. In that case, `params` provides parameter values
    :return: The result table as a stream of rows
    """
    global mysql_dict
    # BUG FIX: look up by the thread id (call get_native_id), matching init.
    thread_id = threading.get_native_id()
    cursor = mysql_dict[thread_id][Constants.CURSOR]
    column_names = mysql_dict[thread_id][Constants.COLUMN_NAMES]

    rows = cursor.fetchmany(Constants.BATCH_SIZE)

    return [mgp.Record(row=_name_row_cells(row, column_names)) for row in rows]


def cleanup_migrate_mysql():
    """Release the calling thread's MySQL cursor and connection."""
    global mysql_dict
    thread_id = threading.get_native_id()
    mysql_dict[thread_id][Constants.CURSOR] = None
    # BUG FIX: the original called commit() AFTER close(), which raises on a
    # closed connection. Commit (a no-op for a read stream) before closing.
    mysql_dict[thread_id][Constants.CONNECTION].commit()
    mysql_dict[thread_id][Constants.CONNECTION].close()
    mysql_dict[thread_id][Constants.CONNECTION] = None
    mysql_dict[thread_id][Constants.COLUMN_NAMES] = None


mgp.add_batch_read_proc(mysql, init_migrate_mysql, cleanup_migrate_mysql)
94+
95+
### SQL SERVER
96+
97+
# Per-thread SQL Server (ODBC) connection state, keyed by the OS thread id.
sql_server_dict = {}


def init_migrate_sql_server(
    table_or_sql: str,
    config: mgp.Map,
    config_path: str = "",
    params: mgp.Nullable[mgp.Any] = None,
):
    """Open a SQL Server (ODBC) connection/cursor for the calling thread and execute the query.

    :param table_or_sql: Table name or an SQL query
    :param config: Connection configuration parameters (as in pyodbc.connect)
    :param config_path: Path to a JSON file whose key/value pairs overwrite `config`
    :param params: Optionally, a list/tuple of query parameter values
    """
    global sql_server_dict

    if params:
        # pyodbc only takes positional (sequence) parameters.
        _check_params_type(params, (list, tuple))
    else:
        params = []

    if len(config_path) > 0:
        config = _combine_config(config=config, config_path=config_path)

    if _query_is_table(table_or_sql):
        # NOTE(security): the table name is interpolated directly into the SQL
        # text — callers must not pass untrusted input here.
        table_or_sql = f"SELECT * FROM {table_or_sql};"

    # BUG FIX: key on threading.get_native_id() (the integer thread id), not
    # the function object, so each thread really gets its own entry.
    thread_id = threading.get_native_id()
    if thread_id not in sql_server_dict:
        sql_server_dict[thread_id] = {}

    if Constants.CURSOR not in sql_server_dict[thread_id]:
        sql_server_dict[thread_id][Constants.CURSOR] = None

    if sql_server_dict[thread_id][Constants.CURSOR] is None:
        connection = pyodbc.connect(**config)
        cursor = connection.cursor()
        cursor.execute(table_or_sql, *params)

        sql_server_dict[thread_id][Constants.CONNECTION] = connection
        sql_server_dict[thread_id][Constants.CURSOR] = cursor
        sql_server_dict[thread_id][Constants.COLUMN_NAMES] = [
            column[Constants.I_COLUMN_NAME] for column in cursor.description
        ]


def sql_server(
    table_or_sql: str,
    config: mgp.Map,
    config_path: str = "",
    params: mgp.Nullable[mgp.Any] = None,
) -> mgp.Record(row=mgp.Map):
    """
    With migrate.sql_server you can access SQL Server and execute queries. The result table is converted into a stream,
    and returned rows can be used to create or merge graph structures. Config must be at least an empty map.
    If config_path is passed, every key,value pair from the JSON file will overwrite any values in config.

    :param table_or_sql: Table name or an SQL query
    :param config: Connection configuration parameters (as in pyodbc.connect)
    :param config_path: Path to the JSON file containing configuration parameters (as in pyodbc.connect)
    :param params: Optionally, queries may be parameterized. In that case, `params` provides parameter values
    :return: The result table as a stream of rows
    """
    global sql_server_dict

    # BUG FIX: look up by the thread id (call get_native_id), matching init.
    thread_id = threading.get_native_id()
    cursor = sql_server_dict[thread_id][Constants.CURSOR]
    column_names = sql_server_dict[thread_id][Constants.COLUMN_NAMES]
    rows = cursor.fetchmany(Constants.BATCH_SIZE)

    return [mgp.Record(row=_name_row_cells(row, column_names)) for row in rows]


def cleanup_migrate_sql_server():
    """Release the calling thread's SQL Server cursor and connection."""
    global sql_server_dict
    thread_id = threading.get_native_id()
    sql_server_dict[thread_id][Constants.CURSOR] = None
    # BUG FIX: the original called commit() AFTER close(), which raises on a
    # closed connection. Commit (a no-op for a read stream) before closing.
    sql_server_dict[thread_id][Constants.CONNECTION].commit()
    sql_server_dict[thread_id][Constants.CONNECTION].close()
    sql_server_dict[thread_id][Constants.CONNECTION] = None
    sql_server_dict[thread_id][Constants.COLUMN_NAMES] = None


mgp.add_batch_read_proc(sql_server, init_migrate_sql_server, cleanup_migrate_sql_server)
173+
174+
### Oracle DB
175+
176+
# Per-thread Oracle DB connection state, keyed by the OS thread id.
oracle_db_dict = {}


def init_migrate_oracle_db(
    table_or_sql: str,
    config: mgp.Map,
    config_path: str = "",
    params: mgp.Nullable[mgp.Any] = None,
):
    """Open an Oracle DB connection/cursor for the calling thread and execute the query.

    :param table_or_sql: Table name or an SQL query
    :param config: Connection configuration parameters (as in oracledb.connect)
    :param config_path: Path to a JSON file whose key/value pairs overwrite `config`
    :param params: Optionally, a list/tuple (positional) or map (named) of parameter values
    """
    global oracle_db_dict

    if params:
        _check_params_type(params)

    if len(config_path) > 0:
        config = _combine_config(config=config, config_path=config_path)

    if _query_is_table(table_or_sql):
        # NOTE(security): the table name is interpolated directly into the SQL
        # text — callers must not pass untrusted input here.
        # Oracle rejects a trailing semicolon, hence no ";" here.
        table_or_sql = f"SELECT * FROM {table_or_sql}"

    if not config:
        config = {}

    # Always disable out-of-band breaks, to prevent query execution from
    # hanging. (The original if/else set the same value on both branches.)
    config["disable_oob"] = True

    # BUG FIX: key on threading.get_native_id() (the integer thread id), not
    # the function object, so each thread really gets its own entry.
    thread_id = threading.get_native_id()
    if thread_id not in oracle_db_dict:
        oracle_db_dict[thread_id] = {}

    if Constants.CURSOR not in oracle_db_dict[thread_id]:
        oracle_db_dict[thread_id][Constants.CURSOR] = None

    if oracle_db_dict[thread_id][Constants.CURSOR] is None:
        connection = oracledb.connect(**config)
        cursor = connection.cursor()

        # oracledb takes positional parameters as one sequence argument and
        # named parameters as keyword arguments.
        if not params:
            cursor.execute(table_or_sql)
        elif isinstance(params, (list, tuple)):
            cursor.execute(table_or_sql, params)
        else:
            cursor.execute(table_or_sql, **params)

        oracle_db_dict[thread_id][Constants.CONNECTION] = connection
        oracle_db_dict[thread_id][Constants.CURSOR] = cursor
        oracle_db_dict[thread_id][Constants.COLUMN_NAMES] = [
            column[Constants.I_COLUMN_NAME] for column in cursor.description
        ]


def oracle_db(
    table_or_sql: str,
    config: mgp.Map,
    config_path: str = "",
    params: mgp.Nullable[mgp.Any] = None,
) -> mgp.Record(row=mgp.Map):
    """
    With migrate.oracle_db you can access Oracle DB and execute queries. The result table is converted into a stream,
    and returned rows can be used to create or merge graph structures. Config must be at least an empty map.
    If config_path is passed, every key,value pair from the JSON file will overwrite any values in config.

    :param table_or_sql: Table name or an SQL query
    :param config: Connection configuration parameters (as in oracledb.connect)
    :param config_path: Path to the JSON file containing configuration parameters (as in oracledb.connect)
    :param params: Optionally, queries may be parameterized. In that case, `params` provides parameter values
    :return: The result table as a stream of rows
    """

    global oracle_db_dict
    # BUG FIX: look up by the thread id (call get_native_id), matching init.
    thread_id = threading.get_native_id()
    cursor = oracle_db_dict[thread_id][Constants.CURSOR]
    column_names = oracle_db_dict[thread_id][Constants.COLUMN_NAMES]
    rows = cursor.fetchmany(Constants.BATCH_SIZE)

    return [mgp.Record(row=_name_row_cells(row, column_names)) for row in rows]


def cleanup_migrate_oracle_db():
    """Release the calling thread's Oracle DB cursor and connection."""
    global oracle_db_dict
    thread_id = threading.get_native_id()
    oracle_db_dict[thread_id][Constants.CURSOR] = None
    # BUG FIX: the original called commit() AFTER close(), which raises on a
    # closed connection. Commit (a no-op for a read stream) before closing.
    oracle_db_dict[thread_id][Constants.CONNECTION].commit()
    oracle_db_dict[thread_id][Constants.CONNECTION].close()
    oracle_db_dict[thread_id][Constants.CONNECTION] = None
    oracle_db_dict[thread_id][Constants.COLUMN_NAMES] = None


mgp.add_batch_read_proc(oracle_db, init_migrate_oracle_db, cleanup_migrate_oracle_db)
265+
266+
267+
def _query_is_table(table_or_sql: str) -> bool:
268+
return len(table_or_sql.split()) == 1
269+
270+
271+
def _load_config(path: str) -> Dict[str, Any]:
272+
try:
273+
with open(path, mode="r") as config:
274+
return json.load(config)
275+
except Exception:
276+
raise OSError("Could not open/read file.")
277+
278+
279+
def _combine_config(config: mgp.Map, config_path: str) -> Dict[str, Any]:
    """Merge key/value pairs from the JSON file at `config_path` into `config`.

    Values loaded from the file overwrite values already present in `config`;
    the mapping is updated in place and also returned.

    :param config: Base connection configuration (mutated in place).
    :param config_path: Path to a JSON file with overriding configuration.
    :raises ValueError: If `config_path` is empty.
    """
    # BUG FIX: validate with a raise instead of `assert` — asserts are
    # stripped when Python runs with -O, silently skipping the check.
    if not config_path:
        raise ValueError("Path must not be empty")
    config_items = _load_config(path=config_path)

    for key, value in config_items.items():
        config[key] = value
    return config
286+
287+
288+
def _name_row_cells(row_cells, column_names) -> Dict[str, Any]:
289+
return dict(map(lambda column, value: (column, value), column_names, row_cells))
290+
291+
292+
def _check_params_type(params: Any, types=(dict, list, tuple)) -> None:
293+
if not isinstance(params, types):
294+
raise TypeError(
295+
"Database query parameter values must be passed in a container of type List[Any] (or Map, if migrating from MySQL or Oracle DB)"
296+
)

python/requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,6 @@ torchmetrics==0.9.3
99
igraph==0.10.2
1010
scikit-learn==0.24.2
1111
gqlalchemy==1.4.1
12+
mysql-connector-python==8.0.32
13+
oracledb==1.2.2
14+
pyodbc==4.0.35

0 commit comments

Comments
 (0)