From cc7ffe39789130bcac6b6527b42061a2d3fe7f0b Mon Sep 17 00:00:00 2001 From: LoserCheems <3314685395@qq.com> Date: Fri, 10 Oct 2025 15:39:08 +0800 Subject: [PATCH 1/5] Add issue/PR templates; relax mask/bias checks Introduces standardized issue and pull request templates to streamline bug reports, feature proposals, and performance diagnostics. Relaxes validation in variable-length attention forward by dropping dtype/device checks for mask and bias, enabling optional inputs and avoiding unnecessary failures. --- .github/ISSUE_TEMPLATE/bug_report.yml | 69 ++++++++++++++++ .github/ISSUE_TEMPLATE/feature_request.yml | 64 +++++++++++++++ .github/ISSUE_TEMPLATE/performance_issue.yml | 79 +++++++++++++++++++ .github/PULL_REQUEST_TEMPLATE/bug_fix.yml | 60 ++++++++++++++ .../PULL_REQUEST_TEMPLATE/feature_support.yml | 60 ++++++++++++++ .../performance_optimization.yml | 61 ++++++++++++++ csrc/flash_dmattn/flash_api.cpp | 7 +- 7 files changed, 395 insertions(+), 5 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yml create mode 100644 .github/ISSUE_TEMPLATE/feature_request.yml create mode 100644 .github/ISSUE_TEMPLATE/performance_issue.yml create mode 100644 .github/PULL_REQUEST_TEMPLATE/bug_fix.yml create mode 100644 .github/PULL_REQUEST_TEMPLATE/feature_support.yml create mode 100644 .github/PULL_REQUEST_TEMPLATE/performance_optimization.yml diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..dfb007e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,69 @@ +name: Bug report +description: Create a report to help us improve Flash-DMA +title: "[BUG REPORT] " +labels: + - bug +assignees: + - LoserCheems + - Evanwu1125 + - SNHuan + - Thanksyy + - ftgreat + - zacliu2023 + - juliohsu + - wubingheng111 +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to report an issue. Please fill out the details below so we can reproduce and fix the problem quickly. + - type: textarea + id: bug-description + attributes: + label: Describe the bug + description: Provide a concise description of the incorrect behaviour. + placeholder: Unexpected error when calling flash_dmattn(...) + validations: + required: true + - type: textarea + id: reproduction + attributes: + label: Steps to reproduce + description: Share the minimal steps or code necessary for us to see the failure. + placeholder: | + 1. Import flash_dmattn + 2. Run the snippet below + 3. Observe the error + render: python + validations: + required: true + - type: textarea + id: expected-behavior + attributes: + label: Expected behaviour + description: Tell us what you expected to happen instead. + placeholder: The kernel should return valid attention output without raising an exception. + validations: + required: true + - type: textarea + id: environment + attributes: + label: Environment information + description: Run the following command and paste the full output. + placeholder: | + python -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA: {torch.version.cuda}'); print(f'GPU: {torch.cuda.get_device_name() if torch.cuda.is_available() else \"None\"}')" + render: shell + validations: + required: true + - type: textarea + id: additional-context + attributes: + label: Additional context + description: Include sequence lengths, batch sizes, or any other details that might help us debug. + placeholder: Tested with seq_len=8192, batch=2, head_dim=128... + - type: textarea + id: traceback + attributes: + label: Error traceback + description: Paste the full traceback if available. + render: text diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..46a7d39 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,64 @@ +name: Feature request +description: Suggest an idea for FDMA +title: "[FEATURE REQUEST] " +labels: + - feature +assignees: + - LoserCheems + - Evanwu1125 + - SNHuan + - Thanksyy + - ftgreat + - zacliu2023 + - juliohsu + - wubingheng111 +body: + - type: markdown + attributes: + value: | + Help us understand the feature you are proposing and why it matters for Flash-DMA workflows. + - type: textarea + id: problem + attributes: + label: Problem statement + description: Explain the problem or limitation that motivates this feature request. + placeholder: I am limited by... + validations: + required: true + - type: textarea + id: proposed-solution + attributes: + label: Proposed solution + description: Describe the feature or behaviour you would like to see. + placeholder: Introduce a kernel path that... + validations: + required: true + - type: textarea + id: alternatives + attributes: + label: Alternatives considered + description: List any other approaches you have evaluated and why they are insufficient. + placeholder: I tried using... + - type: textarea + id: implementation + attributes: + label: Implementation details + description: Call out potential CUDA/Python changes, performance implications, or compatibility considerations. + placeholder: Requires updates to flash_dmattn_interface and CUDA op... + - type: textarea + id: use-case + attributes: + label: Use case + description: Describe the workloads or scenarios that would benefit from this feature. + placeholder: Long-context code completion with... + - type: textarea + id: references + attributes: + label: Related work + description: Share links to papers, repositories, or prior art that inspired this request. + placeholder: Paper link or repository URL + - type: textarea + id: additional-context + attributes: + label: Additional context + description: Add any extra information or screenshots that may help us understand the request. diff --git a/.github/ISSUE_TEMPLATE/performance_issue.yml b/.github/ISSUE_TEMPLATE/performance_issue.yml new file mode 100644 index 0000000..6708f42 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/performance_issue.yml @@ -0,0 +1,79 @@ +name: Performance issue +description: Report performance problems or optimisation opportunities +title: "[PERFORMANCE] " +labels: + - performance +assignees: + - LoserCheems + - Evanwu1125 + - SNHuan + - Thanksyy + - ftgreat + - zacliu2023 + - juliohsu + - wubingheng111 +body: + - type: markdown + attributes: + value: | + Provide enough detail about performance regressions or optimisation opportunities so we can reproduce and diagnose them. + - type: textarea + id: issue-description + attributes: + label: Performance issue description + description: Summarise the performance problem. + placeholder: Forward latency increases when... + validations: + required: true + - type: textarea + id: current-performance + attributes: + label: Current performance metrics + description: Share benchmark numbers and configuration (sequence length, batch size, heads, head dimension, throughput, memory usage). + placeholder: | + Sequence length: 8192 + Batch size: 2 + Heads: 32 + Head dim: 128 + Speed: 15.2 ms/iteration + Memory: 8.5 GB + validations: + required: true + - type: textarea + id: expected-performance + attributes: + label: Expected performance + description: Explain what performance you expect and the baseline you are comparing against. + placeholder: Expect <10 ms/iteration based on Flash Attention benchmark... + - type: textarea + id: environment + attributes: + label: Environment information + description: Run the following command and paste the output. + placeholder: | + python -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA: {torch.version.cuda}'); print(f'GPU: {torch.cuda.get_device_name() if torch.cuda.is_available() else \"None\"}')" + render: shell + validations: + required: true + - type: textarea + id: benchmark-code + attributes: + label: Benchmark code + description: Provide the code snippet or script used to measure performance. + render: python + - type: textarea + id: profiling + attributes: + label: Profiling information + description: Include relevant excerpts from nsys, nvprof, or PyTorch profiler if available. + - type: textarea + id: system-info + attributes: + label: System information + description: GPU model, compute capability, CPU, RAM, and other hardware details. + placeholder: RTX 4090 24GB, compute capability 8.9, Intel i9-14900K, 64GB RAM + - type: textarea + id: additional-context + attributes: + label: Additional context + description: Mention regressions, different batch sizes, attention patterns, or other observations. diff --git a/.github/PULL_REQUEST_TEMPLATE/bug_fix.yml b/.github/PULL_REQUEST_TEMPLATE/bug_fix.yml new file mode 100644 index 0000000..766415f --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/bug_fix.yml @@ -0,0 +1,60 @@ +name: Bug Fix +description: Fix a bug with clear reproduction, scope, and tests +title: "[BUG FIX] " +labels: + - bug +body: + - type: markdown + attributes: + value: | + Thanks for contributing a bug fix! Please complete the sections below so reviewers can understand and verify the change quickly. + - type: textarea + id: summary + attributes: + label: Summary + description: What bug is fixed and what parts of the codebase are impacted? + placeholder: Resolves crash when... + validations: + required: true + - type: textarea + id: root-cause + attributes: + label: Root cause + description: Briefly describe the underlying issue. + placeholder: The kernel assumed... + - type: textarea + id: changes + attributes: + label: Changes + description: Highlight the notable code-level modifications. + placeholder: Updated flash_dmattn_interface to... + validations: + required: true + - type: textarea + id: reproduction + attributes: + label: Reproduction steps or MRE + description: Provide steps or a minimal snippet that reproduces the original bug. + render: python + - type: textarea + id: tests + attributes: + label: Tests + description: List the tests you added or ran and their results. + placeholder: Ran benchmarks/forward_equivalence.py; added unit test... + validations: + required: true + - type: textarea + id: compatibility + attributes: + label: Compatibility + description: Note any migration concerns or backwards compatibility considerations. + - type: checkboxes + id: checklist + attributes: + label: Checklist + options: + - label: Linked issue provided + - label: Adds or updates tests + - label: Updates docs if needed + - label: No performance regressions observed diff --git a/.github/PULL_REQUEST_TEMPLATE/feature_support.yml b/.github/PULL_REQUEST_TEMPLATE/feature_support.yml new file mode 100644 index 0000000..46bf35d --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/feature_support.yml @@ -0,0 +1,60 @@ +name: Feature Support +description: Introduce a new feature with design context and tests +title: "[FEATURE SUPPORT] " +labels: + - feature +body: + - type: markdown + attributes: + value: | + Share enough detail about the new feature so reviewers can evaluate scope, design, and testing. + - type: textarea + id: summary + attributes: + label: Summary + description: What feature is being added and why? + placeholder: Adds configurable... + validations: + required: true + - type: textarea + id: design + attributes: + label: Design + description: Outline the design or architecture and mention alternatives considered. + placeholder: Uses new backend selection flow... + - type: textarea + id: changes + attributes: + label: Changes + description: Describe new or changed public APIs, configuration, or CLI behaviour. + placeholder: Adds flash_dmattn.feature_flag... + validations: + required: true + - type: textarea + id: implementation-notes + attributes: + label: Implementation notes + description: Highlight tricky parts or noteworthy implementation details. + - type: textarea + id: tests + attributes: + label: Tests + description: List unit or integration tests you added or updated and how you validated them. + placeholder: Ran benchmarks/forward_equivalence.py; added example in... + validations: + required: true + - type: textarea + id: docs + attributes: + label: Documentation + description: Mention doc updates or examples that accompany this feature. + - type: checkboxes + id: checklist + attributes: + label: Checklist + options: + - label: Linked issue provided + - label: API stabilised + - label: Tests added or updated + - label: Docs added or updated + - label: No known performance regressions diff --git a/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml b/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml new file mode 100644 index 0000000..8c0b688 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml @@ -0,0 +1,61 @@ +name: Performance Optimization +description: Optimise performance with benchmark evidence +title: "[PERFORMANCE OPTIMIZATION] " +labels: + - performance +body: + - type: markdown + attributes: + value: | + Document the optimisation, methodology, and results so reviewers can validate gains and correctness. + - type: textarea + id: summary + attributes: + label: Summary + description: What is optimised and why? + placeholder: Improves forward latency for... + validations: + required: true + - type: textarea + id: baseline + attributes: + label: Baseline metrics + description: Provide the current performance numbers and environment. + placeholder: Baseline throughput 150 tok/s on H100 with... + validations: + required: true + - type: textarea + id: approach + attributes: + label: Approach + description: Describe the optimisation techniques used. + placeholder: Introduced block-wise accumulation... + - type: textarea + id: results + attributes: + label: Results + description: Share before/after benchmarks and how to reproduce them. + placeholder: | + Before: 15.2 ms/iteration (benchmark command) + After: 9.8 ms/iteration (benchmark command) + validations: + required: true + - type: textarea + id: impact + attributes: + label: Impact + description: Note memory, throughput trade-offs, or hardware-specific considerations. + - type: textarea + id: risks + attributes: + label: Risks + description: Highlight edge cases, correctness risks, or gating tests added. + - type: checkboxes + id: checklist + attributes: + label: Checklist + options: + - label: Linked issue provided + - label: Benchmarks included and reproducible + - label: No accuracy regression + - label: Docs updated where needed diff --git a/csrc/flash_dmattn/flash_api.cpp b/csrc/flash_dmattn/flash_api.cpp index 8e68df2..c033d73 100644 --- a/csrc/flash_dmattn/flash_api.cpp +++ b/csrc/flash_dmattn/flash_api.cpp @@ -584,14 +584,11 @@ mha_varlen_fwd( TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, "FlashDynamicMaskAttention only support fp16 and bf16 data type"); TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); - TORCH_CHECK(mask.dtype() == torch::kBool, "mask must have dtype bool"); - TORCH_CHECK(bias.dtype() == q_dtype, "bias must have the same dtype as inputs"); TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32"); TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32"); - CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); CHECK_DEVICE(mask); CHECK_DEVICE(bias); - CHECK_DEVICE(cu_seqlens_q); - CHECK_DEVICE(cu_seqlens_k); + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + CHECK_DEVICE(cu_seqlens_q); CHECK_DEVICE(cu_seqlens_k); at::Tensor block_table; // const bool paged_KV = block_table_.has_value(); From 09974b7b94a3e9e161f25f4ce3733dbd247f3725 Mon Sep 17 00:00:00 2001 From: Jingze Shi Date: Fri, 10 Oct 2025 15:41:48 +0800 Subject: [PATCH 2/5] Update .github/PULL_REQUEST_TEMPLATE/performance_optimization.yml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .github/PULL_REQUEST_TEMPLATE/performance_optimization.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml b/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml index 8c0b688..2eb165d 100644 --- a/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml +++ b/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml @@ -1,5 +1,5 @@ name: Performance Optimization -description: Optimise performance with benchmark evidence +description: Optimize performance with benchmark evidence title: "[PERFORMANCE OPTIMIZATION] " labels: - performance From b3d5308d33f2739dc7d219823fe2ad48f0503118 Mon Sep 17 00:00:00 2001 From: Jingze Shi Date: Fri, 10 Oct 2025 15:41:55 +0800 Subject: [PATCH 3/5] Update .github/PULL_REQUEST_TEMPLATE/performance_optimization.yml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .github/PULL_REQUEST_TEMPLATE/performance_optimization.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml b/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml index 2eb165d..3c106a0 100644 --- a/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml +++ b/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml @@ -28,7 +28,7 @@ body: id: approach attributes: label: Approach - description: Describe the optimisation techniques used. + description: Describe the optimization techniques used. placeholder: Introduced block-wise accumulation... - type: textarea id: results From d73f150124b668f8ee0727d848d167e5ca04af1a Mon Sep 17 00:00:00 2001 From: Jingze Shi Date: Fri, 10 Oct 2025 15:42:02 +0800 Subject: [PATCH 4/5] Update .github/ISSUE_TEMPLATE/performance_issue.yml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .github/ISSUE_TEMPLATE/performance_issue.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/performance_issue.yml b/.github/ISSUE_TEMPLATE/performance_issue.yml index 6708f42..c3ecca4 100644 --- a/.github/ISSUE_TEMPLATE/performance_issue.yml +++ b/.github/ISSUE_TEMPLATE/performance_issue.yml @@ -16,7 +16,7 @@ body: - type: markdown attributes: value: | - Provide enough detail about performance regressions or optimisation opportunities so we can reproduce and diagnose them. + Provide enough detail about performance regressions or optimization opportunities so we can reproduce and diagnose them. - type: textarea id: issue-description attributes: From 3689550f5ad843a5176d443ae6d7c60247b0a7a6 Mon Sep 17 00:00:00 2001 From: Jingze Shi Date: Fri, 10 Oct 2025 15:42:15 +0800 Subject: [PATCH 5/5] Update .github/PULL_REQUEST_TEMPLATE/performance_optimization.yml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .github/PULL_REQUEST_TEMPLATE/performance_optimization.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml b/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml index 3c106a0..5a00c15 100644 --- a/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml +++ b/.github/PULL_REQUEST_TEMPLATE/performance_optimization.yml @@ -12,7 +12,7 @@ body: id: summary attributes: label: Summary - description: What is optimised and why? + description: What is optimized and why? placeholder: Improves forward latency for... validations: required: true