Use thread fences instead of atomic variables in cann_task_queue

hipudding · hipudding · commit 42ee0cbf533e · 2025-04-15T10:38:53.000Z
diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h
@@ -241,7 +241,7 @@ class cann_task_queue {
      */
     explicit cann_task_queue(size_t capacity, int32_t device)
         : buffer_(capacity), capacity_(capacity), head_(0), tail_(0),
-          running_(false), device_(device), consuming_(false) {
+          running_(false), device_(device) {
         GGML_ASSERT((capacity & (capacity - 1)) == 0 && "capacity must be power of 2");
         mask_ = capacity_ - 1;
     }
@@ -253,92 +253,52 @@ class cann_task_queue {
      * @return true if the task was successfully enqueued, false if the queue was full.
      */
     bool enqueue(std::unique_ptr<cann_task>&& item) {
-        size_t tail = tail_.load(std::memory_order_relaxed);
-        size_t next_tail = (tail + 1) & mask_;
+        size_t next_tail = (tail_ + 1) & mask_;
 
-        if (next_tail == head_.load(std::memory_order_acquire)) {
+        if (next_tail == head_) {
             return false;
         }
 
-        buffer_[tail] = std::move(item);
-        tail_.store(next_tail, std::memory_order_release);
-
-        cv_.notify_one();
+        buffer_[tail_] = std::move(item);
+        std::atomic_thread_fence(std::memory_order_release);
+        tail_ = next_tail;
 
         return true;
     }
 
-    /**
-     * @brief Dequeues all available tasks in bulk into an output vector.
-     * 
-     * @param output Output vector that will contain the dequeued tasks.
-     * @return Number of tasks dequeued.
-     */
-    size_t dequeue_bulk(std::vector<std::unique_ptr<cann_task>>& output) {
-        output.clear();
-        size_t head = head_.load(std::memory_order_relaxed);
-        size_t tail = tail_.load(std::memory_order_acquire);
-
-        while (running_ && head == tail) {
-            std::unique_lock<std::mutex> lock(mutex_);
-            cv_.wait(lock);
-            head = head_.load(std::memory_order_relaxed);
-            tail = tail_.load(std::memory_order_acquire);
-        }
-
-        size_t count = 0;
-        while (running_ && head != tail) {
-            output.push_back(std::move(buffer_[head]));
-            head = (head + 1) & mask_;
-            ++count;
-        }
-
-        head_.store(head, std::memory_order_release);
-        return count;
-    }
-
     /**
      * @brief Submits a task to the queue, and starts the worker thread if not already running.
      * 
      * @param task Task to be submitted.
      */
     void submit_task(std::unique_ptr<cann_task>&& task) {
-        while(!enqueue(std::move(task))) continue;
+        while(!enqueue(std::move(task))) {
+            std::this_thread::yield();
+            continue;
+        }
         
         if (!running_) {
-            thread_ = std::thread(&cann_task_queue::execute, this);
             running_ = true;
+            thread_ = std::thread(&cann_task_queue::execute, this);
         }
         
     }
 
-    /**
-     * @brief Checks whether the queue is empty.
-     * 
-     * @return true if the queue is empty, false otherwise.
-     */
-    bool empty() const {
-        return head_.load(std::memory_order_acquire) ==
-               tail_.load(std::memory_order_acquire);
-    }
-
     /**
      * @brief Waits until the queue is completely empty and no tasks are being processed.
      */
     void wait() {
-        if (!running_)
-            return;
-
-        while (!(empty() && consuming_)) {}
+        while (running_ && head_ != tail_) {
+            std::this_thread::yield();
+            continue;
+        }
     }
 
     /**
      * @brief Stops the task queue and joins the worker thread.
      */
     void stop() {
         running_ = false;
-        wait();
-        cv_.notify_all();
         if (thread_.joinable()) {
             thread_.join();
         }
@@ -349,33 +309,29 @@ class cann_task_queue {
      * @brief Worker thread function that continuously dequeues and executes tasks.
      */
     void execute() {
-        std::vector<std::unique_ptr<cann_task>> tasks;
         ggml_cann_set_device(device_);
 
-        while(running_) {
-            consuming_ = true;
-            int count = dequeue_bulk(tasks);
-            consuming_ = false;
-            if (count == 0)
+        while (running_) {
+            if(head_ == tail_) {
+                std::this_thread::yield();
                 continue;
-            
-            for(auto &task : tasks) {
-                task->run_task();
             }
+
+            std::atomic_thread_fence(std::memory_order_acquire);
+            buffer_[head_]->run_task();
+            buffer_[head_].reset();
+            head_ = (head_ + 1) & mask_;
         }
     }
 
     std::vector<std::unique_ptr<cann_task>> buffer_;
     const size_t capacity_;
     size_t mask_;
-    std::atomic<size_t> head_;
-    std::atomic<size_t> tail_;
-    std::mutex mutex_;
-    std::condition_variable cv_;
+    size_t head_;
+    size_t tail_;
     bool running_;
     std::thread thread_;
     int32_t device_;
-    bool consuming_;
 };
 
 /**