From cc7ffe39789130bcac6b6527b42061a2d3fe7f0b Mon Sep 17 00:00:00 2001
From: LoserCheems <3314685395@qq.com>
Date: Fri, 10 Oct 2025 15:39:08 +0800
Subject: [PATCH 1/5] Add issue/PR templates; relax mask/bias checks

Introduces standardized issue and pull request templates to streamline bug reports, feature proposals, and performance diagnostics.

Relaxes validation in variable-length attention forward by dropping dtype/device checks for mask and bias, enabling optional inputs and avoiding unnecessary failures.
---
 .github/ISSUE_TEMPLATE/bug_report.yml         | 69 ++++++++++++++++
 .github/ISSUE_TEMPLATE/feature_request.yml    | 64 +++++++++++++++
 .github/ISSUE_TEMPLATE/performance_issue.yml  | 79 +++++++++++++++++++
 .github/PULL_REQUEST_TEMPLATE/bug_fix.yml     | 60 ++++++++++++++
 .../PULL_REQUEST_TEMPLATE/feature_support.yml | 60 ++++++++++++++
 .../performance_optimization.yml              | 61 ++++++++++++++
 csrc/flash_dmattn/flash_api.cpp               |  7 +-
 7 files changed, 395 insertions(+), 5 deletions(-)
 create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yml
 create mode 100644 .github/ISSUE_TEMPLATE/feature_request.yml
 create mode 100644 .github/ISSUE_TEMPLATE/performance_issue.yml
 create mode 100644 .github/PULL_REQUEST_TEMPLATE/bug_fix.yml
 create mode 100644 .github/PULL_REQUEST_TEMPLATE/feature_support.yml
 create mode 100644 .github/PULL_REQUEST_TEMPLATE/performance_optimization.yml

diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
new file mode 100644
index 0000000..dfb007e
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,69 @@
+name: Bug report
+description: Create a report to help us improve Flash-DMA
+title: "[BUG REPORT] "
+labels:
+  - bug
+assignees:
+  - LoserCheems
+  - Evanwu1125
+  - SNHuan
+  - Thanksyy
+  - ftgreat
+  - zacliu2023
+  - juliohsu
+  - wubingheng111
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for taking the time to report an issue. Please fill out the details below so we can reproduce and fix the problem quickly.
+  - type: textarea
+    id: bug-description
+    attributes:
+      label: Describe the bug
+      description: Provide a concise description of the incorrect behaviour.
+      placeholder: Unexpected error when calling flash_dmattn(...)
+    validations:
+      required: true
+  - type: textarea
+    id: reproduction
+    attributes:
+      label: Steps to reproduce
+      description: Share the minimal steps or code necessary for us to see the failure.
+      placeholder: |
+        1. Import flash_dmattn
+        2. Run the snippet below
+        3. Observe the error
+      render: python
+    validations:
+      required: true
+  - type: textarea
+    id: expected-behavior
+    attributes:
+      label: Expected behaviour
+      description: Tell us what you expected to happen instead.
+      placeholder: The kernel should return valid attention output without raising an exception.
+    validations:
+      required: true
+  - type: textarea
+    id: environment
+    attributes:
+      label: Environment information
+      description: Run the following command and paste the full output.
+      placeholder: |
+        python -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA: {torch.version.cuda}'); print(f'GPU: {torch.cuda.get_device_name() if torch.cuda.is_available() else \"None\"}')"
+      render: shell
+    validations:
+      required: true
+  - type: textarea
+    id: additional-context
+    attributes:
+      label: Additional context
+      description: Include sequence lengths, batch sizes, or any other details that might help us debug.
+      placeholder: Tested with seq_len=8192, batch=2, head_dim=128...
+  - type: textarea
+    id: traceback
+    attributes:
+      label: Error traceback
+      description: Paste the full traceback if available.
+      render: text
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml
new file mode 100644
index 0000000..46a7d39
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.yml
@@ -0,0 +1,64 @@
+name: Feature request
+description: Suggest an idea for Flash-DMA
+title: "[FEATURE REQUEST] "
+labels:
+  - feature
+assignees:
+  - LoserCheems
+  - Evanwu1125
+  - SNHuan
+  - Thanksyy
+  - ftgreat
+  - zacliu2023
+  - juliohsu
+  - wubingheng111
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Help us understand the feature you are proposing and why it matters for Flash-DMA workflows.
+  - type: textarea
+    id: problem
+    attributes:
+      label: Problem statement
+      description: Explain the problem or limitation that motivates this feature request.
+      placeholder: I am limited by...
+    validations:
+      required: true
+  - type: textarea
+    id: proposed-solution
+    attributes:
+      label: Proposed solution
+      description: Describe the feature or behaviour you would like to see.
+      placeholder: Introduce a kernel path that...
+    validations:
+      required: true
+  - type: textarea
+    id: alternatives
+    attributes:
+      label: Alternatives considered
+      description: List any other approaches you have evaluated and why they are insufficient.
+      placeholder: I tried using...
+  - type: textarea
+    id: implementation
+    attributes:
+      label: Implementation details
+      description: Call out potential CUDA/Python changes, performance implications, or compatibility considerations.
+      placeholder: Requires updates to flash_dmattn_interface and CUDA op...
+  - type: textarea
+    id: use-case
+    attributes:
+      label: Use case
+      description: Describe the workloads or scenarios that would benefit from this feature.
+      placeholder: Long-context code completion with...
+  - type: textarea
+    id: references
+    attributes:
+      label: Related work
+      description: Share links to papers, repositories, or prior art that inspired this request.
+      placeholder: Paper link or repository URL
+  - type: textarea
+    id: additional-context
+    attributes:
+      label: Additional context
+      description: Add any extra information or screenshots that may help us understand the request.
diff --git a/.github/ISSUE_TEMPLATE/performance_issue.yml b/.github/ISSUE_TEMPLATE/performance_issue.yml
new file mode 100644
index 0000000..6708f42
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/performance_issue.yml
@@ -0,0 +1,79 @@
+name: Performance issue
+description: Report performance problems or optimization opportunities
+title: "[PERFORMANCE] "
+labels:
+  - performance
+assignees:
+  - LoserCheems
+  - Evanwu1125
+  - SNHuan
+  - Thanksyy
+  - ftgreat
+  - zacliu2023
+  - juliohsu
+  - wubingheng111
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Provide enough detail about performance regressions or optimisation opportunities so we can reproduce and diagnose them.
+  - type: textarea
+    id: issue-description
+    attributes:
+      label: Performance issue description
+      description: Summarize the performance problem.
+      placeholder: Forward latency increases when...
+    validations:
+      required: true
+  - type: textarea
+    id: current-performance
+    attributes:
+      label: Current performance metrics
+      description: Share benchmark numbers and configuration (sequence length, batch size, heads, head dimension, throughput, memory usage).
+      placeholder: |
+        Sequence length: 8192
+        Batch size: 2
+        Heads: 32
+        Head dim: 128
+        Speed: 15.2 ms/iteration
+        Memory: 8.5 GB
+    validations:
+      required: true
+  - type: textarea
+    id: expected-performance
+    attributes:
+      label: Expected performance
+      description: Explain what performance you expect and the baseline you are comparing against.
+      placeholder: Expect <10 ms/iteration based on Flash Attention benchmark...
+  - type: textarea
+    id: environment
+    attributes:
+      label: Environment information
+      description: Run the following command and paste the output.
+      placeholder: |
+        python -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA: {torch.version.cuda}'); print(f'GPU: {torch.cuda.get_device_name() if torch.cuda.is_available() else \"None\"}')"
+      render: shell
+    validations:
+      required: true
+  - type: textarea
+    id: benchmark-code
+    attributes:
+      label: Benchmark code
+      description: Provide the code snippet or script used to measure performance.
+      render: python
+  - type: textarea
+    id: profiling
+    attributes:
+      label: Profiling information
+      description: Include relevant excerpts from nsys, nvprof, or PyTorch profiler if available.
+  - type: textarea
+    id: system-info
+    attributes:
+      label: System information
+      description: GPU model, compute capability, CPU, RAM, and other hardware details.
+      placeholder: RTX 4090 24GB, compute capability 8.9, Intel i9-14900K, 64GB RAM
+  - type: textarea
+    id: additional-context
+    attributes:
+      label: Additional context
+      description: Mention regressions, different batch sizes, attention patterns, or other observations.
diff --git a/.github/PULL_REQUEST_TEMPLATE/bug_fix.yml b/.github/PULL_REQUEST_TEMPLATE/bug_fix.yml
new file mode 100644
index 0000000..766415f
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE/bug_fix.yml
@@ -0,0 +1,60 @@
+name: Bug Fix
+description: Fix a bug with clear reproduction, scope, and tests
+title: "[BUG FIX] "
+labels:
+  - bug
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for contributing a bug fix! Please complete the sections below so reviewers can understand and verify the change quickly.
+  - type: textarea
+    id: summary
+    attributes:
+      label: Summary
+      description: What bug is fixed and what parts of the codebase are impacted?
+      placeholder: Resolves crash when...
+    validations:
+      required: true
+  - type: textarea
+    id: root-cause
+    attributes:
+      label: Root cause
+      description: Briefly describe the underlying issue.
+      placeholder: The kernel assumed...
+  - type: textarea
+    id: changes
+    attributes:
+      label: Changes
+      description: Highlight the notable code-level modifications.
+      placeholder: Updated flash_dmattn_interface to...
+    validations:
+      required: true
+  - type: textarea
+    id: reproduction
+    attributes:
+      label: Reproduction steps or MRE
+      description: Provide steps or a minimal snippet that reproduces the original bug.
+      render: python
+  - type: textarea
+    id: tests
+    attributes:
+      label: Tests
+      description: List the tests you added or ran and their results.
+      placeholder: Ran benchmarks/forward_equivalence.py; added unit test...
+    validations:
+      required: true
+  - type: textarea
+    id: compatibility
+    attributes:
+      label: Compatibility
+      description: Note any migration concerns or backwards compatibility considerations.
+  - type: checkboxes
+    id: checklist
+    attributes:
+      label: Checklist
+      options:
+        - label: Linked issue provided
+        - label: Adds or updates tests
+        - label: Updates docs if needed
+        - label: No performance regressions observed
diff --git a/.github/PULL_REQUEST_TEMPLATE/feature_support.yml b/.github/PULL_REQUEST_TEMPLATE/feature_support.yml
new file mode 100644
index 0000000..46bf35d
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE/feature_support.yml
@@ -0,0 +1,60 @@
+name: Feature Support
+description: Introduce a new feature with design context and tests
+title: "[FEATURE SUPPORT] "
+labels:
+  - feature
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Share enough detail about the new feature so reviewers can evaluate scope, design, and testing.
+  - type: textarea
+    id: summary
+    attributes:
+      label: Summary
+      description: What feature is being added and why?
+      placeholder: Adds configurable...
+    validations:
+      required: true
+  - type: textarea
+    id: design
+    attributes:
+      label: Design
+      description: Outline the design or architecture and mention alternatives considered.
+      placeholder: Uses new backend selection flow...
+  - type: textarea
+    id: changes
+    attributes:
+      label: Changes
+      description: Describe new or changed public APIs, configuration, or CLI behaviour.
+      placeholder: Adds flash_dmattn.feature_flag...
+    validations:
+      required: true
+  - type: textarea
+    id: implementation-notes
+    attributes:
+      label: Implementation notes
+      description: Highlight tricky parts or noteworthy implementation details.
+  - type: textarea
+    id: tests
+    attributes:
+      label: Tests
+      description: List unit or integration tests you added or updated and how you validated them.
+      placeholder: Ran benchmarks/forward_equivalence.py; added example in...
+    validations:
+      required: true
+  - type: textarea
+    id: docs
+    attributes:
+      label: Documentation
+      description: Mention doc updates or examples that accompany this feature.
+  - type: checkboxes
+    id: checklist
+    attributes:
+      label: Checklist
+      options:
+        - label: Linked issue provided
+        - label: API stabilised
+        - label: Tests added or updated
+        - label: Docs added or updated
+        - label: No known performance regressions
diff --git a/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml b/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml
new file mode 100644
index 0000000..8c0b688
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml
@@ -0,0 +1,61 @@
+name: Performance Optimization
+description: Optimise performance with benchmark evidence
+title: "[PERFORMANCE OPTIMIZATION] "
+labels:
+  - performance
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Document the optimization, methodology, and results so reviewers can validate gains and correctness.
+  - type: textarea
+    id: summary
+    attributes:
+      label: Summary
+      description: What is optimised and why?
+      placeholder: Improves forward latency for...
+    validations:
+      required: true
+  - type: textarea
+    id: baseline
+    attributes:
+      label: Baseline metrics
+      description: Provide the current performance numbers and environment.
+      placeholder: Baseline throughput 150 tok/s on H100 with...
+    validations:
+      required: true
+  - type: textarea
+    id: approach
+    attributes:
+      label: Approach
+      description: Describe the optimisation techniques used.
+      placeholder: Introduced block-wise accumulation...
+  - type: textarea
+    id: results
+    attributes:
+      label: Results
+      description: Share before/after benchmarks and how to reproduce them.
+      placeholder: |
+        Before: 15.2 ms/iteration (benchmark command)
+        After: 9.8 ms/iteration (benchmark command)
+    validations:
+      required: true
+  - type: textarea
+    id: impact
+    attributes:
+      label: Impact
+      description: Note memory, throughput trade-offs, or hardware-specific considerations.
+  - type: textarea
+    id: risks
+    attributes:
+      label: Risks
+      description: Highlight edge cases, correctness risks, or gating tests added.
+  - type: checkboxes
+    id: checklist
+    attributes:
+      label: Checklist
+      options:
+        - label: Linked issue provided
+        - label: Benchmarks included and reproducible
+        - label: No accuracy regression
+        - label: Docs updated where needed
diff --git a/csrc/flash_dmattn/flash_api.cpp b/csrc/flash_dmattn/flash_api.cpp
index 8e68df2..c033d73 100644
--- a/csrc/flash_dmattn/flash_api.cpp
+++ b/csrc/flash_dmattn/flash_api.cpp
@@ -584,14 +584,11 @@ mha_varlen_fwd(
     TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, "FlashDynamicMaskAttention only support fp16 and bf16 data type");
     TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype");
     TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype");
-    TORCH_CHECK(mask.dtype() == torch::kBool, "mask must have dtype bool");
-    TORCH_CHECK(bias.dtype() == q_dtype, "bias must have the same dtype as inputs");
     TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32");
     TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32");
 
-    CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); CHECK_DEVICE(mask); CHECK_DEVICE(bias);
-    CHECK_DEVICE(cu_seqlens_q);
-    CHECK_DEVICE(cu_seqlens_k);
+    CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v);
+    CHECK_DEVICE(cu_seqlens_q); CHECK_DEVICE(cu_seqlens_k);
 
     at::Tensor block_table;
     // const bool paged_KV = block_table_.has_value();

From 09974b7b94a3e9e161f25f4ce3733dbd247f3725 Mon Sep 17 00:00:00 2001
From: Jingze Shi <losercheems@gmail.com>
Date: Fri, 10 Oct 2025 15:41:48 +0800
Subject: [PATCH 2/5] Update
 .github/PULL_REQUEST_TEMPLATE/performance_optimization.yml

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .github/PULL_REQUEST_TEMPLATE/performance_optimization.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml b/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml
index 8c0b688..2eb165d 100644
--- a/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml
+++ b/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml
@@ -1,5 +1,5 @@
 name: Performance Optimization
-description: Optimise performance with benchmark evidence
+description: Optimize performance with benchmark evidence
 title: "[PERFORMANCE OPTIMIZATION] "
 labels:
   - performance

From b3d5308d33f2739dc7d219823fe2ad48f0503118 Mon Sep 17 00:00:00 2001
From: Jingze Shi <losercheems@gmail.com>
Date: Fri, 10 Oct 2025 15:41:55 +0800
Subject: [PATCH 3/5] Update
 .github/PULL_REQUEST_TEMPLATE/performance_optimization.yml

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .github/PULL_REQUEST_TEMPLATE/performance_optimization.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml b/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml
index 2eb165d..3c106a0 100644
--- a/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml
+++ b/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml
@@ -28,7 +28,7 @@ body:
     id: approach
     attributes:
       label: Approach
-      description: Describe the optimisation techniques used.
+      description: Describe the optimization techniques used.
       placeholder: Introduced block-wise accumulation...
   - type: textarea
     id: results

From d73f150124b668f8ee0727d848d167e5ca04af1a Mon Sep 17 00:00:00 2001
From: Jingze Shi <losercheems@gmail.com>
Date: Fri, 10 Oct 2025 15:42:02 +0800
Subject: [PATCH 4/5] Update .github/ISSUE_TEMPLATE/performance_issue.yml

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .github/ISSUE_TEMPLATE/performance_issue.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ISSUE_TEMPLATE/performance_issue.yml b/.github/ISSUE_TEMPLATE/performance_issue.yml
index 6708f42..c3ecca4 100644
--- a/.github/ISSUE_TEMPLATE/performance_issue.yml
+++ b/.github/ISSUE_TEMPLATE/performance_issue.yml
@@ -16,7 +16,7 @@ body:
   - type: markdown
     attributes:
       value: |
-        Provide enough detail about performance regressions or optimisation opportunities so we can reproduce and diagnose them.
+        Provide enough detail about performance regressions or optimization opportunities so we can reproduce and diagnose them.
   - type: textarea
     id: issue-description
     attributes:

From 3689550f5ad843a5176d443ae6d7c60247b0a7a6 Mon Sep 17 00:00:00 2001
From: Jingze Shi <losercheems@gmail.com>
Date: Fri, 10 Oct 2025 15:42:15 +0800
Subject: [PATCH 5/5] Update
 .github/PULL_REQUEST_TEMPLATE/performance_optimization.yml

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .github/PULL_REQUEST_TEMPLATE/performance_optimization.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml b/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml
index 3c106a0..5a00c15 100644
--- a/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml
+++ b/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml
@@ -12,7 +12,7 @@ body:
     id: summary
     attributes:
       label: Summary
-      description: What is optimised and why?
+      description: What is optimized and why?
       placeholder: Improves forward latency for...
     validations:
       required: true