Commit 44d2b0a

Xiao-zhen-Liu authored and bobbai00 committed
Add Storage Layer In Python (#3224)
This PR adds a storage layer implementation on the Python side of Texera's codebase, mirroring the implementation of our Java-based storage layer.

## Motivation

- The primary motivation for having a storage layer in Python is to let Python UDF operators' ports write directly to result tables without needing to send the results back to Java.
- In the future we will also use the Python storage layer for UDF logs and workflow runtime statistics.

## Storage APIs

There are 3 abstract classes in Java's storage implementation:

- `ReadOnlyVirtualDocument` for read-only tables.
- `VirtualDocument` for tables supporting both read and write operations.
- `BufferedItemWriter` as the writer class of `VirtualDocument`.

We mirror the implementation in Python, but keep only the APIs relevant to table storage (e.g., APIs related to dataset storage are not kept in Python). A hedged sketch of this mirrored API surface appears at the end of this section.

## Iceberg Document

Following #3147, we add a table-storage implementation based on Apache Iceberg (pyiceberg), including `IcebergDocument`, `IcebergTableWriter`, `IcebergCatalogInstance`, and related util functions and tests.

### Limitations of / TODOs for the Python implementation

pyiceberg is less mature than its Java-based counterpart. As a result, a few functionalities are not supported in our current Python storage implementation.

#### Incremental Read

Incremental read is not supported by pyiceberg. It will be supported [in the future](apache/iceberg-python#533). Until then we will not include incremental read in our Python codebase (it is also not currently needed).

#### Concurrent writers

Iceberg uses optimistic concurrency control for concurrent writers. Java Iceberg natively supports retry with configurable retry parameters, using exponential backoff (without randomness). However, pyiceberg does not currently support retry. We implemented an ad-hoc custom retry mechanism in `IcebergTableWriter`, using exponential random backoff based on the [tenacity](https://tenacity.readthedocs.io/en/latest/) library; a sketch of this retry pattern likewise appears at the end of this section. It performs well (~0.6 s for 10 concurrent writers writing 20K tuples) and is faster than Java's iceberg-native retry (~6 s for the same test). We may need to re-evaluate this custom implementation if pyiceberg supports retry natively in the future.

## Iceberg Catalog

pyiceberg only supports the SQL catalog (PostgreSQL, specifically) and REST catalogs for production use. We use the PostgreSQL-based SQL catalog in this implementation for the following reasons:

- It supports local storage.
- We tested that it works with both the Java and Python Iceberg storage.
- It is easier to set up for developers (compared to REST services).

### PostgreSQL setup

The Python storage layer requires a running PostgreSQL service in the environment and an empty database for Iceberg to work.

- **A script to set up a new postgres database for Texera's iceberg storage has been added for CI tests.**
- The database will be used by pyiceberg to manage the catalog.
- The logic to set up the database is added in the GitHub CI config.
- The Java side can continue using the Hadoop-based catalog for now, until we add storage on operator ports for both Java and Python.
- As the Python storage is not currently used by Python workers, no action is required from developers for now.
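To make the mirrored API surface in the Storage APIs section concrete, here is a hedged sketch of the three abstract classes. The class names come from the description above, but every method name and signature below is an assumption for illustration, not the PR's actual interface:

```python
# Hedged sketch of the mirrored storage API surface; method names and
# signatures are assumptions based on the Java description, not the PR code.
from abc import ABC, abstractmethod
from typing import Generic, Iterator, TypeVar

T = TypeVar("T")


class ReadOnlyVirtualDocument(ABC, Generic[T]):
    """A read-only table."""

    @abstractmethod
    def get(self) -> Iterator[T]:
        """Iterate over all items in the table."""


class BufferedItemWriter(ABC, Generic[T]):
    """A buffered writer attached to a VirtualDocument."""

    @abstractmethod
    def put_one(self, item: T) -> None:
        """Buffer one item for writing."""

    @abstractmethod
    def close(self) -> None:
        """Flush remaining buffered items and commit."""


class VirtualDocument(ReadOnlyVirtualDocument[T]):
    """A table supporting both read and write operations."""

    @abstractmethod
    def writer(self, writer_id: str) -> BufferedItemWriter[T]:
        """Create a BufferedItemWriter for this document."""

    @abstractmethod
    def clear(self) -> None:
        """Remove all items from the table."""
```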
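The Concurrent writers section above describes an ad-hoc retry in `IcebergTableWriter`. Below is a minimal sketch of that pattern using tenacity; the wrapper class, method names, and retry parameters are illustrative assumptions, not the PR's code:

```python
# Minimal sketch (not the PR's code) of retrying an Iceberg commit with
# tenacity's exponential random backoff. Retry parameters are illustrative.
import pyarrow as pa
from pyiceberg.exceptions import CommitFailedException
from pyiceberg.table import Table
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_random_exponential,
)


class RetryingWriter:
    def __init__(self, table: Table):
        self._table = table

    # Re-invoke _append after a randomized exponential backoff whenever a
    # concurrent writer wins the optimistic-concurrency commit race.
    @retry(
        retry=retry_if_exception_type(CommitFailedException),
        stop=stop_after_attempt(10),          # illustrative bound
        wait=wait_random_exponential(multiplier=0.05, max=2.0),
        reraise=True,
    )
    def _append(self, buffer: pa.Table) -> None:
        # table.append() raises CommitFailedException when another writer
        # committed first; tenacity retries this call after backing off.
        self._table.append(buffer)
```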
### REST catalogs (feel free to skip this section)

I also explored 3 major REST catalog implementations ([lakekeeper](https://lakekeeper.io), [polaris](https://polaris.apache.org), and [gravitino](https://gravitino.apache.org)), and here are some observations:

- REST catalogs are the trend primarily because different query engines (Spark, Flink, Snowflake, etc.) relying on Iceberg need a central place to keep and manage the catalogs. Under the hood they all still use some database as their storage layer.
- Most of them support or recommend only cloud storage in production and do not support local storage.
- They are incubating projects and lack documentation. For example, I found it very hard to set up authentication using gravitino (pyiceberg requires authentication to work with REST catalogs), and using them would add a lot more burden for our developers.
- I successfully made polaris work with our implementation after setting up auth, but somehow it was very slow.
- As the postgres catalog is working, we will explore REST catalogs further in the future if we have migrated to cloud storage and run into scalability issues.

## Storage configurations

A static class `StorageConfig` is added to manage storage-related configurations. We do NOT read the configs from files. Instead, we let Java pass the configs to the Python worker, and the config will be filled when initializing the worker (a sketch of this pattern is shown at the end of this description). The storage config is hardcoded in CI tests.

## Other items

`VFSURIFactory` and `DocumentFactory` are added to the Python storage layer, mirroring the Java implementations.

## TODO for Java Storage

- Add SQL catalog as another type of Iceberg catalog

---------

Co-authored-by: Jiadong Bai <[email protected]>
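As a concrete illustration of the storage-configurations design above, here is a minimal sketch of a static config holder filled once at worker initialization. The field names follow the `StorageConfig` references elsewhere in this diff, while the `initialize()` method and its signature are assumptions:

```python
# Sketch of a static config holder populated once at worker startup with
# values passed from Java; nothing is read from config files on the Python
# side. The initialize() signature is an assumption, not the PR's API.
class StorageConfig:
    ICEBERG_TABLE_NAMESPACE: str = ""
    ICEBERG_FILE_STORAGE_DIRECTORY_PATH: str = ""
    ICEBERG_POSTGRES_CATALOG_USERNAME: str = ""
    ICEBERG_POSTGRES_CATALOG_PASSWORD: str = ""
    _initialized: bool = False

    @classmethod
    def initialize(
        cls, namespace: str, directory: str, username: str, password: str
    ) -> None:
        # Called exactly once while the Python worker is being initialized,
        # with config values sent over from the Java side.
        if cls._initialized:
            raise RuntimeError("storage config is already initialized")
        cls.ICEBERG_TABLE_NAMESPACE = namespace
        cls.ICEBERG_FILE_STORAGE_DIRECTORY_PATH = directory
        cls.ICEBERG_POSTGRES_CATALOG_USERNAME = username
        cls.ICEBERG_POSTGRES_CATALOG_PASSWORD = password
        cls._initialized = True
```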
1 parent 60d9d3e commit 44d2b0a

19 files changed: +1328 -3 lines

.github/workflows/github-action-build.yml

Lines changed: 7 additions & 0 deletions
@@ -111,6 +111,13 @@ jobs:
           if [ -f core/amber/requirements.txt ]; then pip install -r core/amber/requirements.txt; fi
           if [ -f core/amber/r-requirements.txt ]; then pip install -r core/amber/r-requirements.txt; fi
           if [ -f core/amber/operator-requirements.txt ]; then pip install -r core/amber/operator-requirements.txt; fi
+      - name: Install PostgreSQL
+        run: sudo apt-get update && sudo apt-get install -y postgresql
+      - name: Start PostgreSQL Service
+        run: sudo systemctl start postgresql
+      - name: Create Database and User
+        run: |
+          cd core/scripts/sql && sudo -u postgres psql -f iceberg_postgres_catalog.sql
       - name: Lint with flake8 and black
         run: |
           cd core/amber/src/main/python && flake8 && black . --check

core/amber/requirements.txt

Lines changed: 6 additions & 1 deletion
@@ -26,4 +26,9 @@ bidict==0.22.0
 cached_property==1.5.2
 psutil==5.9.0
 transformers==4.44.2
-tzlocal==2.1
+tzlocal==2.1
+pyiceberg==0.8.1
+readerwriterlock==1.0.9
+tenacity==8.5.0
+SQLAlchemy==2.0.37
+psycopg2==2.9.10

core/amber/src/main/python/core/models/schema/attribute_type.py

Lines changed: 3 additions & 1 deletion
@@ -42,16 +42,18 @@ class AttributeType(Enum):
     AttributeType.DOUBLE: pa.float64(),
     AttributeType.BOOL: pa.bool_(),
     AttributeType.BINARY: pa.binary(),
-    AttributeType.TIMESTAMP: pa.timestamp("ms", tz="UTC"),
+    AttributeType.TIMESTAMP: pa.timestamp("us"),
 }

 FROM_ARROW_MAPPING = {
     lib.Type_INT32: AttributeType.INT,
     lib.Type_INT64: AttributeType.LONG,
     lib.Type_STRING: AttributeType.STRING,
+    lib.Type_LARGE_STRING: AttributeType.STRING,
     lib.Type_DOUBLE: AttributeType.DOUBLE,
     lib.Type_BOOL: AttributeType.BOOL,
     lib.Type_BINARY: AttributeType.BINARY,
+    lib.Type_LARGE_BINARY: AttributeType.BINARY,
     lib.Type_TIMESTAMP: AttributeType.TIMESTAMP,
 }

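A note on the `pa.timestamp("ms", tz="UTC")` to `pa.timestamp("us")` change above (and in `test_schema.py` below): Iceberg's `timestamp` type is timezone-free with microsecond precision, so the Arrow schema has to match for pyiceberg to accept it. A small illustration, not code from the PR:

```python
# Illustration only (not from the PR): pyarrow timestamps must be
# microsecond-precision and timezone-free to line up with Iceberg's
# `timestamp` type.
from datetime import datetime

import pyarrow as pa

arrow_type = pa.timestamp("us")  # microsecond precision, no timezone
col = pa.array([datetime(2025, 1, 1, 12, 0, 0, 123456)], type=arrow_type)
print(col.type)  # timestamp[us]
print(col[0])    # 2025-01-01 12:00:00.123456
```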
core/amber/src/main/python/core/models/schema/test_schema.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ def arrow_schema(self):
                 pa.field("field-3", pa.int64()),
                 pa.field("field-4", pa.float64()),
                 pa.field("field-5", pa.bool_()),
-                pa.field("field-6", pa.timestamp("ms", tz="UTC")),
+                pa.field("field-6", pa.timestamp("us")),
                 pa.field("field-7", pa.binary()),
             ]
         )

core/amber/src/main/python/core/storage/__init__.py

Whitespace-only changes.
core/amber/src/main/python/core/storage/document_factory.py

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
+from urllib.parse import urlparse
+
+from typing import Optional
+
+from core.models import Schema, Tuple
+from core.storage.iceberg.iceberg_catalog_instance import IcebergCatalogInstance
+from core.storage.iceberg.iceberg_document import IcebergDocument
+from core.storage.iceberg.iceberg_utils import (
+    create_table,
+    amber_tuples_to_arrow_table,
+    arrow_table_to_amber_tuples,
+    load_table_metadata,
+)
+from core.storage.model.virtual_document import VirtualDocument
+from core.storage.storage_config import StorageConfig
+from core.storage.vfs_uri_factory import VFSURIFactory, VFSResourceType
+
+
+class DocumentFactory:
+    """
+    Factory class to create and open documents.
+    Currently only iceberg documents are supported.
+    """
+
+    ICEBERG = "iceberg"
+
+    @staticmethod
+    def sanitize_uri_path(uri):
+        return uri.path.lstrip("/").replace("/", "_")
+
+    @staticmethod
+    def create_document(uri: str, schema: Schema) -> VirtualDocument:
+        parsed_uri = urlparse(uri)
+        if parsed_uri.scheme == VFSURIFactory.VFS_FILE_URI_SCHEME:
+            _, _, _, _, resource_type = VFSURIFactory.decode_uri(uri)
+
+            if resource_type in {
+                VFSResourceType.RESULT,
+                VFSResourceType.MATERIALIZED_RESULT,
+            }:
+                storage_key = DocumentFactory.sanitize_uri_path(parsed_uri)
+
+                iceberg_schema = Schema.as_arrow_schema(schema)
+
+                create_table(
+                    IcebergCatalogInstance.get_instance(),
+                    StorageConfig.ICEBERG_TABLE_NAMESPACE,
+                    storage_key,
+                    iceberg_schema,
+                    override_if_exists=True,
+                )
+
+                return IcebergDocument[Tuple](
+                    StorageConfig.ICEBERG_TABLE_NAMESPACE,
+                    storage_key,
+                    iceberg_schema,
+                    amber_tuples_to_arrow_table,
+                    arrow_table_to_amber_tuples,
+                )
+            else:
+                raise ValueError(f"Resource type {resource_type} is not supported")
+        else:
+            raise NotImplementedError(
+                f"Unsupported URI scheme: {parsed_uri.scheme} for creating the document"
+            )
+
+    @staticmethod
+    def open_document(uri: str) -> (VirtualDocument, Optional[Schema]):
+        parsed_uri = urlparse(uri)
+        if parsed_uri.scheme == "vfs":
+            _, _, _, _, resource_type = VFSURIFactory.decode_uri(uri)
+
+            if resource_type in {
+                VFSResourceType.RESULT,
+                VFSResourceType.MATERIALIZED_RESULT,
+            }:
+                storage_key = DocumentFactory.sanitize_uri_path(parsed_uri)
+
+                table = load_table_metadata(
+                    IcebergCatalogInstance.get_instance(),
+                    StorageConfig.ICEBERG_TABLE_NAMESPACE,
+                    storage_key,
+                )
+
+                if table is None:
+                    raise ValueError("No storage is found for the given URI")
+
+                amber_schema = Schema(table.schema().as_arrow())
+
+                document = IcebergDocument(
+                    StorageConfig.ICEBERG_TABLE_NAMESPACE,
+                    storage_key,
+                    table.schema(),
+                    amber_tuples_to_arrow_table,
+                    arrow_table_to_amber_tuples,
+                )
+                return document, amber_schema
+            else:
+                raise ValueError(f"Resource type {resource_type} is not supported")
+        else:
+            raise NotImplementedError(
+                f"Unsupported URI scheme: {parsed_uri.scheme} for opening the document"
+            )

core/amber/src/main/python/core/storage/iceberg/__init__.py

Whitespace-only changes.
core/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+from pyiceberg.catalog import Catalog
+from typing import Optional
+
+from core.storage.iceberg.iceberg_utils import create_postgres_catalog
+from core.storage.storage_config import StorageConfig
+
+
+class IcebergCatalogInstance:
+    """
+    IcebergCatalogInstance is a singleton that manages the Iceberg catalog instance.
+    Currently only postgres SQL catalog is supported.
+    - Provides a single shared catalog for all Iceberg table-related operations.
+    - Lazily initializes the catalog on first access.
+    - Supports replacing the catalog instance for testing or reconfiguration.
+    """
+
+    _instance: Optional[Catalog] = None
+
+    @classmethod
+    def get_instance(cls):
+        """
+        Retrieves the singleton Iceberg catalog instance.
+        - If the catalog is not initialized, it is lazily created using the configured
+          properties.
+        :return: the Iceberg catalog instance.
+        """
+        if cls._instance is None:
+            cls._instance = create_postgres_catalog(
+                "texera_iceberg",
+                StorageConfig.ICEBERG_FILE_STORAGE_DIRECTORY_PATH,
+                StorageConfig.ICEBERG_POSTGRES_CATALOG_USERNAME,
+                StorageConfig.ICEBERG_POSTGRES_CATALOG_PASSWORD,
+            )
+        return cls._instance
+
+    @classmethod
+    def replace_instance(cls, catalog: Catalog):
+        """
+        Replaces the existing Iceberg catalog instance.
+        - This method is useful for testing or dynamically updating the catalog.
+        :param catalog: the new Iceberg catalog instance to replace the current one.
+        """
+        cls._instance = catalog

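The docstring above notes that `replace_instance` exists for testing or reconfiguration. One possible test usage, sketched as an assumption rather than code from the PR, is to swap in pyiceberg's `SqlCatalog` backed by an in-memory SQLite database so tests do not need a running PostgreSQL; the catalog name and warehouse path below are arbitrary:

```python
# Illustration (not from the PR): replace the shared catalog with an
# in-memory SQLite-backed SqlCatalog for tests.
from pyiceberg.catalog.sql import SqlCatalog

test_catalog = SqlCatalog(
    "texera_iceberg_test",
    uri="sqlite:///:memory:",                     # catalog metadata in memory
    warehouse="file:///tmp/texera-iceberg-test",  # table data on local disk
)
IcebergCatalogInstance.replace_instance(test_catalog)
assert IcebergCatalogInstance.get_instance() is test_catalog
```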