Skip to content

Commit 1b91f41

Browse files
committed
Merge remote-tracking branch 'origin/dns_fix' into dns_fix
2 parents c2e9000 + 2bdf482 commit 1b91f41

File tree

5 files changed

+273
-20
lines changed

5 files changed

+273
-20
lines changed
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# dae DNS 改进计划 v2 - 开发执行文档
2+
3+
> baseline: mosDNS(`.plan/dae_vs_mosdns_table.csv` / `.plan/dae_vs_mosDNS_findings.md`)
4+
5+
## 执行原则
6+
1. 严格串行:Tn 完成实现并完成任务级测试记录后,才进入 Tn+1。
7+
2. 每个任务必须包含:代码变更、任务级测试、测试结论。
8+
3. 里程碑回归:阶段任务全部完成后进行一次回归测试。
9+
10+
## 任务拆解与落地
11+
12+
### T1(P0-1):修复 UDP forwarder 连接回收
13+
- 变更点:
14+
- `DoUDP.ForwardDNS` 拨号成功后保存 `d.conn`
15+
- `DoUDP.Close` 关闭后置 `d.conn=nil`,确保幂等回收。
16+
- 目标:降低 UDP 连接泄漏导致的 FD/端口压力。
17+
18+
### T2(P1-1):失败路径接入 timeout 健康反馈
19+
- 变更点:
20+
- `dialSend` 在 forwarder 返回 timeout 错误时调用 `timeoutExceedCallback`
21+
- 新增 `isTimeoutError` 统一识别 `context deadline exceeded` / `net.Error.Timeout()`
22+
- 目标:让健康度系统尽快降权不健康路径。
23+
24+
### T3(P1-3):tcp+udp 增加同查询 UDP→TCP fallback
25+
- 变更点:
26+
- 新增 `tcpFallbackDialArgument`:仅在 upstream 为 `tcp+udp` 且首发 UDP timeout 时切换 TCP。
27+
- `dialSend` 在一次查询内执行一次 fallback 重试,避免无限重试放大。
28+
- 目标:降低 UDP 瞬时抖动造成的直接失败。
29+
30+
### T4(P1-4):统一上下文/超时语义
31+
- 变更点:
32+
- DoH 请求改为 `http.NewRequestWithContext`
33+
- `sendStreamDNS` 增加 `ctx` 入参,优先使用 `ctx.Deadline()` 设置 stream deadline,并在 I/O 前后检查 `ctx`
34+
- 目标:提升取消及时性,降低尾延迟。
35+
36+
### T5(P2-5):ipversion_prefer 从固定并发双查改为“优先+条件补查”
37+
- 变更点:
38+
- `Handle_` 对 A/AAAA 请求改为先查首选 qtype;仅当需要时再补查另一族。
39+
- 目标:降低上游请求放大与高压下 timeout 叠加。
40+
41+
## 本轮范围说明
42+
- 本轮完成 T1~T5。
43+
- P2-6(DNS 维度指标与自适应)需要较大横切改造(指标面板+选择器反馈回路),建议下一迭代单独实施。

.plan/test-log.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# DNS 改进测试记录
2+
3+
## T1(UDP 连接回收)
4+
- 命令:`rg -n "d\.conn = conn|d\.conn = nil" control/dns.go`
5+
- 结果:命中 `DoUDP.ForwardDNS` 中的 `d.conn = conn`,以及 `DoUDP.Close` 中的 `d.conn = nil`。
6+
- 结论:通过(实现与预期一致)。
7+
8+
## T2(超时反馈闭环)
9+
- 命令:`rg -n "timeoutExceedCallback|isTimeoutError\(" control/dns_control.go`
10+
- 结果:命中 `dialSend` 失败路径回调上报,以及 `isTimeoutError` 超时识别函数。
11+
- 结论:通过(失败可反馈到健康度系统)。
12+
13+
## T3(tcp+udp 同查询 fallback)
14+
- 命令:`rg -n "tcpFallbackDialArgument|fallbackForwarder" control/dns_control.go`
15+
- 结果:命中 UDP 失败后 TCP fallback 逻辑及一次性 fallback 执行路径。
16+
- 结论:通过(具备同查询协议兜底能力)。
17+
18+
## T4(上下文/超时语义统一)
19+
- 命令:`rg -n "NewRequestWithContext|sendHttpDNS\(|sendStreamDNS\(ctx" control/dns.go`
20+
- 结果:DoH 使用 `http.NewRequestWithContext`;stream DNS 调用与实现均带 `ctx`
21+
- 结论:通过(超时/取消语义已向协议层传递)。
22+
23+
## T5(ipversion_prefer 条件补查)
24+
- 命令:`rg -n "Query preferred qtype first|handle_\(dnsMessage2|done := make\(chan" control/dns_control.go`
25+
- 结果:命中“先查首选再条件补查”路径;未再出现旧版并发双查 `done` channel 逻辑。
26+
- 结论:通过(请求放大被抑制)。
27+
28+
## 里程碑回归(代码级)
29+
- 命令:`go test ./control -run 'Test(IsTimeoutError|TcpFallbackDialArgument|SendStreamDNSRespectsContextCancelBeforeIO)' -count=1`
30+
- 结果:失败,原因是环境无法从 `proxy.golang.org` 拉取依赖(`github.com/daeuniverse/outbound` 403 Forbidden)。
31+
- 结论:受环境限制,未完成自动化回归;本轮以静态实现校验作为替代。

control/dns.go

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* SPDX-License-Identifier: AGPL-3.0-only
33
* Copyright (c) 2022-2025, daeuniverse Organization <dae@v2raya.org>
4-
*/
4+
*/
55

66
package control
77

@@ -80,11 +80,11 @@ func (d *DoH) ForwardDNS(ctx context.Context, data []byte) (*dnsmessage.Msg, err
8080
if d.client == nil {
8181
d.client = d.getClient()
8282
}
83-
msg, err := sendHttpDNS(d.client, d.dialArgument.bestTarget.String(), &d.Upstream, data)
83+
msg, err := sendHttpDNS(ctx, d.client, d.dialArgument.bestTarget.String(), &d.Upstream, data)
8484
if err != nil {
8585
// If failed to send DNS request, we should try to create a new client.
8686
d.client = d.getClient()
87-
msg, err = sendHttpDNS(d.client, d.dialArgument.bestTarget.String(), &d.Upstream, data)
87+
msg, err = sendHttpDNS(ctx, d.client, d.dialArgument.bestTarget.String(), &d.Upstream, data)
8888
if err != nil {
8989
return nil, err
9090
}
@@ -196,7 +196,7 @@ func (d *DoQ) ForwardDNS(ctx context.Context, data []byte) (*dnsmessage.Msg, err
196196
// thanks https://github.com/natesales/q/blob/1cb2639caf69bd0a9b46494a3c689130df8fb24a/transport/quic.go#L97
197197
binary.BigEndian.PutUint16(data[0:2], 0)
198198

199-
msg, err := sendStreamDNS(stream, data)
199+
msg, err := sendStreamDNS(ctx, stream, data)
200200
if err != nil {
201201
return nil, err
202202
}
@@ -259,7 +259,7 @@ func (d *DoTLS) ForwardDNS(ctx context.Context, data []byte) (*dnsmessage.Msg, e
259259
}
260260
d.conn = tlsConn
261261

262-
return sendStreamDNS(tlsConn, data)
262+
return sendStreamDNS(ctx, tlsConn, data)
263263
}
264264

265265
func (d *DoTLS) Close() error {
@@ -287,7 +287,7 @@ func (d *DoTCP) ForwardDNS(ctx context.Context, data []byte) (*dnsmessage.Msg, e
287287
}
288288

289289
d.conn = conn
290-
return sendStreamDNS(conn, data)
290+
return sendStreamDNS(ctx, conn, data)
291291
}
292292

293293
func (d *DoTCP) Close() error {
@@ -313,6 +313,7 @@ func (d *DoUDP) ForwardDNS(ctx context.Context, data []byte) (*dnsmessage.Msg, e
313313
if err != nil {
314314
return nil, err
315315
}
316+
d.conn = conn
316317

317318
timeout := 5 * time.Second
318319
_ = conn.SetDeadline(time.Now().Add(timeout))
@@ -362,12 +363,14 @@ func (d *DoUDP) ForwardDNS(ctx context.Context, data []byte) (*dnsmessage.Msg, e
362363

363364
func (d *DoUDP) Close() error {
364365
if d.conn != nil {
365-
return d.conn.Close()
366+
err := d.conn.Close()
367+
d.conn = nil
368+
return err
366369
}
367370
return nil
368371
}
369372

370-
func sendHttpDNS(client *http.Client, target string, upstream *dns.Upstream, data []byte) (respMsg *dnsmessage.Msg, err error) {
373+
func sendHttpDNS(ctx context.Context, client *http.Client, target string, upstream *dns.Upstream, data []byte) (respMsg *dnsmessage.Msg, err error) {
371374
// disable redirect https://github.com/daeuniverse/dae/pull/649#issuecomment-2379577896
372375
client.CheckRedirect = func(req *http.Request, via []*http.Request) error {
373376
return fmt.Errorf("do not use a server that will redirect, upstream: %v", upstream.String())
@@ -384,7 +387,7 @@ func sendHttpDNS(client *http.Client, target string, upstream *dns.Upstream, dat
384387
q.Set("dns", base64.RawURLEncoding.EncodeToString(data))
385388
serverURL.RawQuery = q.Encode()
386389

387-
req, err := http.NewRequest(http.MethodGet, serverURL.String(), nil)
390+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, serverURL.String(), nil)
388391
if err != nil {
389392
return nil, err
390393
}
@@ -406,7 +409,19 @@ func sendHttpDNS(client *http.Client, target string, upstream *dns.Upstream, dat
406409
return &msg, nil
407410
}
408411

409-
func sendStreamDNS(stream io.ReadWriter, data []byte) (respMsg *dnsmessage.Msg, err error) {
412+
func sendStreamDNS(ctx context.Context, stream io.ReadWriter, data []byte) (respMsg *dnsmessage.Msg, err error) {
413+
type streamDeadliner interface {
414+
SetDeadline(t time.Time) error
415+
}
416+
if deadliner, ok := stream.(streamDeadliner); ok {
417+
if deadline, ok := ctx.Deadline(); ok {
418+
_ = deadliner.SetDeadline(deadline)
419+
}
420+
}
421+
if err = ctx.Err(); err != nil {
422+
return nil, err
423+
}
424+
410425
// We should write two byte length in the front of stream DNS request.
411426
bReq := pool.Get(2 + len(data))
412427
defer pool.Put(bReq)
@@ -416,11 +431,17 @@ func sendStreamDNS(stream io.ReadWriter, data []byte) (respMsg *dnsmessage.Msg,
416431
if err != nil {
417432
return nil, fmt.Errorf("failed to write DNS req: %w", err)
418433
}
434+
if err = ctx.Err(); err != nil {
435+
return nil, err
436+
}
419437

420438
// Read two byte length.
421439
if _, err = io.ReadFull(stream, bReq[:2]); err != nil {
422440
return nil, fmt.Errorf("failed to read DNS resp payload length: %w", err)
423441
}
442+
if err = ctx.Err(); err != nil {
443+
return nil, err
444+
}
424445
respLen := int(binary.BigEndian.Uint16(bReq))
425446
// Try to reuse the buf.
426447
var buf []byte

control/dns_control.go

Lines changed: 68 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package control
77

88
import (
99
"context"
10+
"errors"
1011
"fmt"
1112
"math"
1213
"net"
@@ -387,7 +388,7 @@ func (c *DnsController) Handle_(dnsMessage *dnsmessage.Msg, req *udpRequest) (er
387388
return c.handle_(dnsMessage, req, true)
388389
}
389390

390-
// Try to make both A and AAAA lookups.
391+
// Query preferred qtype first, and only fallback to the opposite qtype when needed.
391392
dnsMessage2 := deepcopy.Copy(dnsMessage).(*dnsmessage.Msg)
392393
dnsMessage2.Id = uint16(fastrand.Intn(math.MaxUint16))
393394
var qtype2 uint16
@@ -401,13 +402,7 @@ func (c *DnsController) Handle_(dnsMessage *dnsmessage.Msg, req *udpRequest) (er
401402
}
402403
dnsMessage2.Question[0].Qtype = qtype2
403404

404-
done := make(chan struct{})
405-
go func() {
406-
_ = c.handle_(dnsMessage2, req, false)
407-
done <- struct{}{}
408-
}()
409405
err = c.handle_(dnsMessage, req, false)
410-
<-done
411406
if err != nil {
412407
return err
413408
}
@@ -422,12 +417,21 @@ func (c *DnsController) Handle_(dnsMessage *dnsmessage.Msg, req *udpRequest) (er
422417
return c.sendReject_(dnsMessage, req)
423418
}
424419
// resp is valid.
425-
cache2 := c.LookupDnsRespCache(c.cacheKey(qname, qtype2), true)
426-
if c.qtypePrefer == qtype || cache2 == nil || !cache2.IncludeAnyIp() {
420+
if c.qtypePrefer == qtype {
427421
return sendPkt(c.log, resp, req.realDst, req.realSrc, req.src, req.lConn)
428-
} else {
422+
}
423+
424+
cache2 := c.LookupDnsRespCache(c.cacheKey(qname, qtype2), true)
425+
if cache2 == nil || !cache2.IncludeAnyIp() {
426+
if err = c.handle_(dnsMessage2, req, false); err != nil {
427+
return err
428+
}
429+
cache2 = c.LookupDnsRespCache(c.cacheKey(qname, qtype2), true)
430+
}
431+
if cache2 != nil && cache2.IncludeAnyIp() {
429432
return c.sendReject_(dnsMessage, req)
430433
}
434+
return sendPkt(c.log, resp, req.realDst, req.realSrc, req.src, req.lConn)
431435
}
432436

433437
func (c *DnsController) handle_(
@@ -601,6 +605,31 @@ func (c *DnsController) dialSend(invokingDepth int, req *udpRequest, data []byte
601605
}
602606

603607
respMsg, err = forwarder.ForwardDNS(ctxDial, data)
608+
if err != nil {
609+
if c.timeoutExceedCallback != nil && isTimeoutError(err) {
610+
c.timeoutExceedCallback(dialArgument, err)
611+
}
612+
if fallbackDialArgument := tcpFallbackDialArgument(upstream, dialArgument, err); fallbackDialArgument != nil {
613+
fallbackForwarder, fallbackErr := newDnsForwarder(upstream, *fallbackDialArgument)
614+
if fallbackErr != nil {
615+
return err
616+
}
617+
defer fallbackForwarder.Close()
618+
respMsg, fallbackErr = fallbackForwarder.ForwardDNS(ctxDial, data)
619+
if fallbackErr == nil {
620+
dialArgument = fallbackDialArgument
621+
err = nil
622+
} else {
623+
if c.timeoutExceedCallback != nil && isTimeoutError(fallbackErr) {
624+
c.timeoutExceedCallback(fallbackDialArgument, fallbackErr)
625+
}
626+
return fallbackErr
627+
}
628+
} else {
629+
return err
630+
}
631+
}
632+
604633
if err != nil {
605634
return err
606635
}
@@ -691,3 +720,32 @@ func (c *DnsController) dialSend(invokingDepth int, req *udpRequest, data []byte
691720
}
692721
return nil
693722
}
723+
724+
// isTimeoutError reports whether err represents a timeout.
//
// It returns false for a nil error. It returns true when err wraps
// context.DeadlineExceeded. Otherwise, if err (or an error it wraps) implements
// net.Error, the result is that error's Timeout() value. Any other error yields
// false.
func isTimeoutError(err error) bool {
725+
if err == nil {
726+
return false
727+
}
728+
// Context deadline expiry is checked first, via the wrapped-error chain.
if errors.Is(err, context.DeadlineExceeded) {
729+
return true
730+
}
731+
var netErr net.Error
732+
// Fall back to the net.Error contract for network-level timeouts.
if errors.As(err, &netErr) {
733+
return netErr.Timeout()
734+
}
735+
return false
736+
}
737+
738+
// tcpFallbackDialArgument returns the dial argument to use for a TCP retry, or
// nil when no fallback applies.
//
// A fallback is produced only when all of the following hold:
//   - upstream is non-nil and its Scheme is dns.UpstreamScheme_TCP_UDP;
//   - dialArgument is non-nil and its l4proto is consts.L4ProtoStr_UDP;
//   - err is a timeout according to isTimeoutError.
//
// The returned value points to a copy of *dialArgument with l4proto set to
// consts.L4ProtoStr_TCP; the argument passed in is not modified.
func tcpFallbackDialArgument(upstream *dns.Upstream, dialArgument *dialArgument, err error) *dialArgument {
739+
if upstream == nil || upstream.Scheme != dns.UpstreamScheme_TCP_UDP {
740+
return nil
741+
}
742+
if dialArgument == nil || dialArgument.l4proto != consts.L4ProtoStr_UDP {
743+
return nil
744+
}
745+
if !isTimeoutError(err) {
746+
return nil
747+
}
748+
// Copy by value so the caller's dial argument keeps its original l4proto.
fallback := *dialArgument
749+
fallback.l4proto = consts.L4ProtoStr_TCP
750+
return &fallback
751+
}

0 commit comments

Comments
 (0)