InfiniTensor · Ceng23333 · Oct 20, 2025 · Oct 29, 2025 · PanZezhong1725 · Oct 21, 2025
diff --git a/build_fixed.sh b/build_fixed.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+# Fixed build script for InfiniCore
+# This script sets up the environment and builds with proper linker configuration
+
+echo "Setting up InfiniCore build environment..."
+
+# Initialize conda
+eval "$(conda shell.bash hook)"
+
+# Activate the infinicore-env environment
+conda activate infinicore-env
+
+# Set CUDA_HOME to the conda environment
+export CUDA_HOME=$CONDA_PREFIX
+
+# Clean up conflicting environment variables
+unset CC
+unset CXX
+unset NVCC_PREPEND_FLAGS
+unset NVCC_APPEND_FLAGS
+unset CUDA_ROOT
+
+# Use system tools
+export PATH="/usr/bin:$PATH"
+
+# Create a wrapper for ld that converts -m64 to -m elf_x86_64
+mkdir -p /tmp/ld_wrapper
+cat > /tmp/ld_wrapper/ld << 'EOF'
+#!/bin/bash
+# Convert -m64 to -m elf_x86_64 for system linker compatibility
+args=()
+skip_next=false
+for arg in "$@"; do
+    if [ "$skip_next" = true ]; then
+        skip_next=false
+        continue
+    fi
+    if [ "$arg" = "-m64" ]; then
+        args+=("-m" "elf_x86_64")
+    elif [ "$arg" = "-fopenmp" ]; then
+        # Skip -fopenmp flag for linker, but add libgomp
+        args+=("-lgomp")
+        continue
+    elif [ "$arg" = "-m" ]; then
+        # Skip -m flag and its argument if it's elf_x86_64 (to avoid duplication)
+        skip_next=true
+        continue
+    else
+        args+=("$arg")
+    fi
+done
+# Add standard C++ library and other required libraries
+args+=("-lstdc++" "-lm" "-lc" "-lgcc_s")
+exec /usr/bin/ld "${args[@]}"
+EOF
+chmod +x /tmp/ld_wrapper/ld
+export PATH="/tmp/ld_wrapper:$PATH"
+
+echo "Environment setup complete!"
+echo "CUDA_HOME: $CUDA_HOME"
+echo "CONDA_PREFIX: $CONDA_PREFIX"
+
+# Configure and build
+echo "Configuring xmake..."
+xmake f -c
+
+echo "Building InfiniCore..."
+xmake build
+
+echo "Build completed!"
diff --git a/example_memory_usage.py b/example_memory_usage.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+"""
+Example script showing how to use InfiniCore memory statistics
+to monitor memory usage during tensor operations.
+"""
+
+import sys
+import os
+
+# Add the current directory to Python path
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+try:
+    import infinicore
+    print("✓ Successfully imported infinicore")
+except ImportError as e:
+    print(f"✗ Failed to import infinicore: {e}")
+    print("Make sure to build the project first with: xmake build _infinicore")
+    sys.exit(1)
+
+def get_memory_summary():
+    """Get a summary of current memory usage."""
+    try:
+        device_stats = infinicore.get_device_memory_stats()
+        return {
+            'allocations': device_stats.allocation[0].current,
+            'allocated_bytes': device_stats.allocated_bytes[0].current,
+            'active_blocks': device_stats.active[0].current,
+            'device_allocations': device_stats.num_device_alloc,
+            'device_deallocations': device_stats.num_device_free
+        }
+    except Exception as e:
+        print(f"Warning: Could not get memory stats: {e}")
+        return None
+
+def print_memory_summary(title, stats):
+    """Print a concise memory summary."""
+    if stats is None:
+        print(f"{title}: Unable to get memory statistics")
+        return
+
+    print(f"{title}:")
+    print(f"  Allocations: {stats['allocations']}")
+    print(f"  Allocated bytes: {stats['allocated_bytes']:,} bytes ({stats['allocated_bytes'] / 1024 / 1024:.2f} MB)")
+    print(f"  Active blocks: {stats['active_blocks']}")
+    print(f"  Device alloc/dealloc: {stats['device_allocations']}/{stats['device_deallocations']}")
+
+def monitor_memory_usage():
+    """Monitor memory usage during tensor operations."""
+    print("=== InfiniCore Memory Usage Monitor ===\n")
+
+    # Initial memory state
+    initial_stats = get_memory_summary()
+    print_memory_summary("Initial Memory State", initial_stats)
+
+    try:
+        # Create some tensors to demonstrate memory usage
+        print("\n1. Creating tensors...")
+
+        # Create a large tensor
+        print("   Creating 1000x1000 float32 tensor...")
+        tensor1 = infinicore.empty((1000, 1000), dtype=infinicore.float32)
+        stats_after_tensor1 = get_memory_summary()
+        print_memory_summary("After creating tensor1", stats_after_tensor1)
+
+        # Create another tensor
+        print("\n   Creating 500x500 float32 tensor...")
+        tensor2 = infinicore.empty((500, 500), dtype=infinicore.float32)
+        stats_after_tensor2 = get_memory_summary()
+        print_memory_summary("After creating tensor2", stats_after_tensor2)
+
+        # Create a third tensor
+        print("\n   Creating 2000x2000 float32 tensor...")
+        tensor3 = infinicore.empty((2000, 2000), dtype=infinicore.float32)
+        stats_after_tensor3 = get_memory_summary()
+        print_memory_summary("After creating tensor3", stats_after_tensor3)
+
+        # Delete some tensors
+        print("\n2. Deleting tensors...")
+        del tensor1
+        stats_after_del1 = get_memory_summary()
+        print_memory_summary("After deleting tensor1", stats_after_del1)
+
+        del tensor2
+        stats_after_del2 = get_memory_summary()
+        print_memory_summary("After deleting tensor2", stats_after_del2)
+
+        # Final cleanup
+        print("\n3. Final cleanup...")
+        del tensor3
+        final_stats = get_memory_summary()
+        print_memory_summary("Final Memory State", final_stats)
+
+        # Show memory difference
+        if initial_stats and final_stats:
+            print(f"\nMemory Usage Summary:")
+            print(f"  Net allocations: {final_stats['allocations'] - initial_stats['allocations']}")
+            print(f"  Net allocated bytes: {final_stats['allocated_bytes'] - initial_stats['allocated_bytes']:,} bytes")
+            print(f"  Net active blocks: {final_stats['active_blocks'] - initial_stats['active_blocks']}")
+
+        print("\n✓ Memory monitoring completed successfully!")
+
+    except Exception as e:
+        print(f"✗ Error during memory monitoring: {e}")
+        import traceback
+        traceback.print_exc()
+
+def demonstrate_stat_types():
+    """Demonstrate different stat types and their usage."""
+    print("\n=== Stat Types Demonstration ===\n")
+
+    try:
+        # Get device stats
+        device_stats = infinicore.get_device_memory_stats()
+
+        print("StatType.AGGREGATE statistics:")
+        print(f"  Allocation count: {device_stats.allocation[0].current}")
+        print(f"  Allocation peak: {device_stats.allocation[0].peak}")
+        print(f"  Allocation total: {device_stats.allocation[0].allocated}")
+        print(f"  Allocation freed: {device_stats.allocation[0].freed}")
+
+        print(f"\nStatType.SMALL_POOL statistics:")
+        print(f"  Allocation count: {device_stats.allocation[1].current}")
+        print(f"  Allocation peak: {device_stats.allocation[1].peak}")
+
+        print(f"\nStatType.LARGE_POOL statistics:")
+        print(f"  Allocation count: {device_stats.allocation[2].current}")
+        print(f"  Allocation peak: {device_stats.allocation[2].peak}")
+
+        print("\n✓ Stat types demonstration completed!")
+
+    except Exception as e:
+        print(f"✗ Error during stat types demonstration: {e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    monitor_memory_usage()
+    demonstrate_stat_types()
diff --git a/include/infinicore/context/context.hpp b/include/infinicore/context/context.hpp
@@ -21,9 +21,9 @@ infiniopHandle_t getInfiniopHandle();
 void syncStream();
 void syncDevice();
 
-std::shared_ptr<Memory> allocateMemory(size_t size);
-std::shared_ptr<Memory> allocateHostMemory(size_t size);
-std::shared_ptr<Memory> allocatePinnedHostMemory(size_t size);
+std::shared_ptr<MemoryBlock> allocateMemory(size_t size);
+std::shared_ptr<MemoryBlock> allocateHostMemory(size_t size);
+std::shared_ptr<MemoryBlock> allocatePinnedHostMemory(size_t size);
 
 void memcpyH2D(void *dst, const void *src, size_t size);
 void memcpyD2H(void *dst, const void *src, size_t size);

diff --git a/include/infinicore/memory.hpp b/include/infinicore/memory.hpp
@@ -1,30 +1,5 @@
 #pragma once
 
-#include "device.hpp"
-
-#include <cstddef>
-#include <functional>
-
-namespace infinicore {
-
-class Memory {
-public:
-    using Deleter = std::function<void(std::byte *)>;
-
-    Memory(std::byte *data, size_t size, Device device, Deleter deleter, bool pin_memory = false);
-    ~Memory();
-
-    std::byte *data();
-    Device device() const;
-    size_t size() const;
-    bool is_pinned() const;
-
-private:
-    std::byte *data_;
-    size_t size_;
-    Device device_;
-    Deleter deleter_;
-    bool is_pinned_;
-};
-
-} // namespace infinicore
+#include "memory/memory_block.hpp"
+#include "memory/memory_pool.hpp"
+#include "memory/memory_segment.hpp"
diff --git a/include/infinicore/memory/memory_block.hpp b/include/infinicore/memory/memory_block.hpp
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "../device.hpp"
+
+#include <cstddef>
+#include <functional>
+#include <memory>
+
+namespace infinicore {
+
+class MemoryBlock { // Holds a data pointer, a size, a Device, a Deleter callback and a pinned flag; member definitions are not in this header.
+public:
+    using Deleter = std::function<void(std::byte *)>; ///< Callable taking a std::byte pointer; NOTE(review): presumably releases the block's storage - confirm in the implementation.
+
+    MemoryBlock(std::byte *data, size_t size, Device device, Deleter deleter, bool pin_memory = false); ///< pin_memory defaults to false; NOTE(review): presumably what is_pinned() reports - confirm.
+    ~MemoryBlock(); ///< User-declared destructor; NOTE(review): presumably invokes the Deleter - confirm in the implementation.
+
+    // Copy operations. Per the original author these use reference counting; the mechanism lives in the implementation and is not visible in this header.
+    MemoryBlock(const MemoryBlock& other);
+    MemoryBlock& operator=(const MemoryBlock& other);
+
+    // Move operations; both are declared noexcept.
+    MemoryBlock(MemoryBlock&& other) noexcept;
+    MemoryBlock& operator=(MemoryBlock&& other) noexcept;
+
+    std::byte *data() const; ///< Returns a pointer to non-const bytes even when called on a const MemoryBlock.
+    Device device() const; ///< Returns a Device by value; presumably device_ (definition not shown here).
+    size_t size() const; ///< Returns a size_t; presumably size_ (definition not shown here; unit assumed to be bytes - TODO confirm).
+    bool is_pinned() const; ///< Returns a bool; presumably is_pinned_ (definition not shown here).
+
+private:
+    std::byte *data_; // Raw pointer; ownership rules are defined by the implementation (not visible here).
+    size_t size_; // NOTE(review): presumably the constructor's size argument - confirm.
+    Device device_; // NOTE(review): presumably the constructor's device argument - confirm.
+    Deleter deleter_; // Stored Deleter callback.
+    bool is_pinned_; // NOTE(review): presumably the constructor's pin_memory argument - confirm.
+};
+
+} // namespace infinicore
diff --git a/include/infinicore/memory/memory_pool.hpp b/include/infinicore/memory/memory_pool.hpp
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+#include <mutex>
+#include <atomic>
+#include <cstddef>
+#include <functional>
+
+namespace infinicore {
+
+struct MemoryInfo { // Bookkeeping record for one pointer: its size, an atomic reference count and a freed flag.
+    std::byte* ptr; ///< Set from constructor argument p.
+    size_t size; ///< Set from constructor argument s; NOTE(review): presumably a byte count - confirm.
+    std::atomic<int> ref_count; ///< Initialized to 1 by the constructor.
+    bool is_freed; ///< Initialized to false by the constructor; a plain bool, not atomic.
+
+    MemoryInfo(std::byte* p, size_t s) // Stores p and s, starts ref_count at 1 and is_freed at false.
+        : ptr(p), size(s), ref_count(1), is_freed(false) {}
+};
+
+class MemoryPool { // Keeps a map from pointer to MemoryInfo next to a mutex; constructor and destructor are private and access goes through instance().
+public:
+    static MemoryPool& instance(); ///< Returns a MemoryPool reference; NOTE(review): presumably a single process-wide object - definition not in this header.
+
+    // Register a memory allocation (pointer plus size); NOTE(review): handling of an already-registered ptr is not visible here.
+    void registerMemory(std::byte* ptr, size_t size);
+
+    // Increment reference count for ptr
+    void addRef(std::byte* ptr);
+
+    // Decrement reference count and potentially free memory; actual_deleter is presumably the callback that performs the free (TODO confirm)
+    void releaseMemory(std::byte* ptr, std::function<void(std::byte*)> actual_deleter);
+
+    // Get reference count; NOTE(review): the value returned for an unregistered ptr is decided by the implementation
+    int getRefCount(std::byte* ptr) const;
+
+    // Check if memory is registered
+    bool isRegistered(std::byte* ptr) const;
+
+    // Check if memory is already freed; NOTE(review): presumably reads MemoryInfo::is_freed - confirm
+    bool isFreed(std::byte* ptr) const;
+
+private:
+    MemoryPool() = default; // Private: code outside the class cannot construct a pool.
+    ~MemoryPool() = default; // Private: code outside the class cannot destroy a pool.
+
+    mutable std::mutex mutex_; // mutable, so it can be locked from the const query methods; presumably guards memory_map_ (confirm).
+    std::unordered_map<std::byte*, std::shared_ptr<MemoryInfo>> memory_map_; // Keyed by pointer; values are shared MemoryInfo records.
+};
+
+} // namespace infinicore
diff --git a/include/infinicore/memory/memory_segment.hpp b/include/infinicore/memory/memory_segment.hpp
diff --git a/include/infinicore/tensor.hpp b/include/infinicore/tensor.hpp
@@ -32,7 +32,7 @@ struct TensorMetaData {
 
 struct TensorData {
     size_t offset;
-    std::shared_ptr<Memory> memory;
+    std::shared_ptr<MemoryBlock> memory;
 };
 
 struct TensorSliceParams {