Skip to content

Commit 01e5ebb

Browse files
committed
add wx domain check
1 parent 90ecf6b commit 01e5ebb

File tree

10 files changed

+206
-23
lines changed

10 files changed

+206
-23
lines changed

infoscan/api/api.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ func (a *Api) StartCrawlerJob(urls []string) (name string, id uint) {
3434
}
3535

3636
// Out2Excel exports every result of the given crawler job to an Excel file
// via Crawler.Out2Excel. filename is passed through as-is; the commented-out
// line shows it was previously resolved relative to config.ResultPath.
func (a *Api) Out2Excel(jobID uint, filename string) {
	//Crawler.Out2Excel(jobID, a.db, filepath.Join(a.config.ResultPath, filename))
	Crawler.Out2Excel(jobID, a.db, filename)
}
4039

infoscan/cmd/config.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ Version: 0.4
33
ResultPath: ./result
44
LogPath: ./log
55
LogLevel: 1 # 日志记录等级,DEBUG:1 WARN:2 INFO:3 ERROR:4 FATAL:5
6-
LogPrintingLevel: 1 #日志打印等级
6+
LogPrintingLevel: 2 #日志打印等级
77
SpiderMaxNum: 5 #最大爬虫数量
88
whitelistFile : whitelist.txt #白名单,白名单不做内容分析
99
Spider:

infoscan/service/Crawler/Crawler.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ type crawler struct {
2525
BloomFilter *bloom.Filter
2626
Urls []string
2727
Scheduler *pkg.QueueScheduler[string]
28+
WXC *Processor.WXDomainCheck
2829
}
2930

3031
func NewCrawlerJob(config *config.Config, db dao.IDAO, name string, urls []string) *CrawlerJob {
@@ -40,6 +41,9 @@ func NewCrawlerJob(config *config.Config, db dao.IDAO, name string, urls []strin
4041
Spiders: map[string]*Spider.Spider{},
4142
Urls: urls,
4243
Scheduler: s,
44+
WXC: &Processor.WXDomainCheck{
45+
IProcessorDAO: db,
46+
},
4347
},
4448
}
4549
}
@@ -69,6 +73,7 @@ func (c *CrawlerJob) Run(ctx context.Context) {
6973
c.init()
7074
var wg sync.WaitGroup
7175
cancel, cancelFunc := context.WithCancel(ctx)
76+
go c.WXC.Run()
7277
for i := 0; i < c.config.SpiderMaxNum; i++ {
7378
wg.Add(1)
7479
go func() {
@@ -108,12 +113,13 @@ func (c *CrawlerJob) CallbackFunc(page *dao.Page, body []byte) {
108113
}
109114

110115
func (c *CrawlerJob) createSpider(URL *url.URL) *Spider.Spider {
116+
111117
spider :=
112118
Spider.NewSpider(&c.config.Spider, c.Job.ID, c.DAO).
113119
SetFilter(c.BloomFilter).
114120
SetMainUrl(URL).
115121
SetCallbackFunc(c.CallbackFunc).
116122
SetReqer(HttpSpider.NewHttpSpider(&c.config.Spider.Httpspider)).
117-
SetProcessor(Processor.NewDataProcessor(c.ID, c.DAO, Processor.DefaultHandlerFuncs, c.config.WhitelistFile))
123+
SetProcessor(Processor.NewDataProcessor(c.ID, c.DAO, append(Processor.DefaultHandlerFuncs, c.WXC.Handler), c.config.WhitelistFile))
118124
return spider
119125
}

infoscan/service/Crawler/Out2Excel.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -119,19 +119,19 @@ func Out2Excel(jobid uint, DAO dao.IDAO, filename string) {
119119
a := []string{}
120120
if f.GetSheetIndex(r.Type) == -1 {
121121
f.NewSheet(r.Type)
122-
raw := map[string]interface{}{}
123-
err := json.Unmarshal([]byte(r.Data), &raw)
124-
if err != nil {
125-
fmt.Println(err.Error())
126-
continue
127-
}
122+
//raw := map[string]interface{}{}
123+
//err := json.Unmarshal([]byte(r.Data), &raw)
124+
//if err != nil {
125+
// fmt.Println(err.Error())
126+
// continue
127+
//}
128128
a = append(a, "URL")
129129
a = append(a, "父URL")
130130
//for k, _ := range raw {
131131
// a = append(a, k)
132132
//}
133133
a = append(a, "数据")
134-
err = f.SetSheetRow(r.Type, "A1", &a)
134+
err := f.SetSheetRow(r.Type, "A1", &a)
135135
if err != nil {
136136
fmt.Println(err.Error())
137137
continue
@@ -173,7 +173,7 @@ func Out2Excel(jobid uint, DAO dao.IDAO, filename string) {
173173
}
174174
a = append(a, url1.URL)
175175
a = append(a, url2.URL)
176-
a = append(a, fmt.Sprintf("%s", r.Data))
176+
a = append(a, r.Data)
177177
err = f.SetSheetRow(r.Type, axis, &a)
178178
if err != nil {
179179
fmt.Println(err.Error())

infoscan/service/Crawler/Processor/ProcessorFunc.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ func EXLinkPF(page *dao.Page, data []byte) (*dao.ProcessResult, error) { //exter
5454
if !page.External {
5555
return &result, errors.New("no data")
5656
}
57+
if strings.Contains(page.Error, "not text") {
58+
return &result, errors.New("no data")
59+
}
5760
if page.Code == 0 {
5861
result.Type = "外部死链"
5962
}

infoscan/service/Crawler/Processor/data_process.go

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,10 @@ func PageFindUrlpressor(ulist []string, iurl string) [][]*url.URL {
9292
var extrls []*url.URL
9393
parse, _ := url.Parse(iurl)
9494
for _, u := range ulist {
95-
if ckSuffixe(u) {
96-
continue
97-
}
9895
if up, err := url.Parse(u); err == nil {
96+
if ckSuffixe(up.Path) {
97+
continue
98+
}
9999
if up.Scheme == "" {
100100
up.Scheme = "http"
101101
}
@@ -105,7 +105,7 @@ func PageFindUrlpressor(ulist []string, iurl string) [][]*url.URL {
105105
extrls = append(extrls, up)
106106
}
107107
} else {
108-
logger.PF(logger.LERROR, "<URLFinder>页面内容中的URL:%s 处理失败:%s,来自页面:%s", u, err.Error(), iurl)
108+
logger.PF(logger.LDEBUG, "<URLFinder>页面内容中的URL:%s 处理失败:%s,来自页面:%s", u, err.Error(), iurl)
109109
}
110110
}
111111
return [][]*url.URL{urls, extrls}
@@ -122,18 +122,17 @@ func HtmlFindUrlpressor(ulist []string, iurl string) [][]*url.URL {
122122
if raUrl == "//" || raUrl == "/" {
123123
continue
124124
}
125-
if strings.Contains(raUrl, "javascript:vo") {
125+
if strings.Contains(raUrl, "javascript:") {
126126
//javascript:void(0)
127127
continue
128128
}
129-
if ckSuffixe(raUrl) {
130-
continue
131-
}
132129
parserulfunc := func(urlstr string, sliec *[]*url.URL) {
133130
if u, err := url.Parse(urlstr); err != nil {
134-
logger.PF(logger.LERROR, "<URLFinder>Html标签属性中的URL:%s 处理失败:%s,来自页面:%s", urlstr, err.Error(), iurl)
131+
logger.PF(logger.LDEBUG, "<URLFinder>Html标签属性中的URL:%s 处理失败:%s,来自页面:%s", urlstr, err.Error(), iurl)
135132
} else if u.Host == "" {
136-
logger.PF(logger.LERROR, "<URLFinder>Html标签属性中的URL:%s 处理失败,来自页面:%s", urlstr, iurl)
133+
logger.PF(logger.LDEBUG, "<URLFinder>Html标签属性中的URL:%s 处理失败,来自页面:%s", urlstr, iurl)
134+
} else if ckSuffixe(u.Path) {
135+
return
137136
} else {
138137
*sliec = append(*sliec, u)
139138
}
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
package Processor
2+
3+
import (
4+
"GScan/infoscan/dao"
5+
"GScan/pkg"
6+
"GScan/pkg/logger"
7+
"bytes"
8+
"encoding/json"
9+
"errors"
10+
"fmt"
11+
"io"
12+
"net/http"
13+
"net/url"
14+
"regexp"
15+
"strings"
16+
)
17+
18+
// WXDomainCheck asynchronously checks crawled external URLs against WeChat's
// link-vetting redirect service and persists a result for every URL that
// WeChat reports as blocked. Handler enqueues pages; Run consumes them.
type WXDomainCheck struct {
	dao.IProcessorDAO                               // persistence layer; AddResult stores check outcomes
	JobID     uint                                  // crawler job this checker belongs to
	Scheduler pkg.QueueScheduler[*dao.Page]         // queue of pages submitted by Handler, drained by Run
	Client    http.Client                           // probe client; Run configures it to not follow redirects
}
24+
25+
func (w *WXDomainCheck) Run() {
26+
w.Client = http.Client{}
27+
w.Client.CheckRedirect = func(req *http.Request, via []*http.Request) error {
28+
return http.ErrUseLastResponse
29+
}
30+
w.Scheduler.Init()
31+
w.Scheduler.Run()
32+
workerChan := w.Scheduler.WorkerChan()
33+
for {
34+
w.Scheduler.WorkerReady(workerChan)
35+
select {
36+
case page := <-workerChan:
37+
res, ok := w.check(page.URL)
38+
if !ok {
39+
result := dao.ProcessResult{
40+
Type: "微信域名检测",
41+
JobID: page.JobID,
42+
PageID: page.ID,
43+
Data: res,
44+
}
45+
w.AddResult(&result)
46+
logger.PF(logger.LINFO, "<DataProcessor>[%s]%s :%s", result.Type, page.URL, result.Data)
47+
}
48+
dao.PagePool.Put(page)
49+
}
50+
}
51+
}
52+
53+
func (w *WXDomainCheck) Handler(page *dao.Page, data []byte) (*dao.ProcessResult, error) {
54+
if !page.External {
55+
return nil, errors.New("no data")
56+
}
57+
if strings.Contains(page.Error, "not text") {
58+
return nil, errors.New("no data")
59+
}
60+
npage := dao.PagePool.Get().(*dao.Page)
61+
marshal, _ := json.Marshal(page)
62+
if err := json.Unmarshal(marshal, npage); err != nil {
63+
return nil, err
64+
}
65+
w.Scheduler.Submit(npage)
66+
67+
return nil, errors.New("no data")
68+
}
69+
70+
// WXRESP mirrors the fields this checker reads from the cgiData JSON blob
// embedded in weixin110.qq.com block pages.
type WXRESP struct {
	Type  string `json:"type"`  // e.g. "empty" when no detailed description exists
	Title string `json:"title"` // short title of the block notice
	Desc  string `json:"desc"`  // detailed description of why the URL was blocked
}
75+
76+
// re extracts the cgiData JSON object embedded in a weixin110.qq.com block
// page. The literal intentionally spans two lines: the captured value runs
// up to the ";" immediately before the closing </script> tag.
var re = regexp.MustCompile(`(?m)cgiData = (.*?);
</script>`)
78+
79+
func (w *WXDomainCheck) check(url0 string) (string, bool) {
80+
wxurl := fmt.Sprintf("https://mp.weixinbridge.com/mp/wapredirect?url=%s", url.QueryEscape(url0))
81+
request, _ := http.NewRequest("GET", wxurl, nil)
82+
resp, err := w.Client.Do(request)
83+
if err != nil {
84+
return err.Error(), true
85+
}
86+
if resp.StatusCode != 302 {
87+
return "StatusCode!=302 可能被风控", true
88+
}
89+
if Location, ok := resp.Header["Location"]; ok {
90+
if !strings.Contains(Location[0], "weixin110.qq.com") {
91+
return "正常", true
92+
}
93+
wxresp, err := http.Get(Location[0])
94+
if err != nil {
95+
return err.Error(), true
96+
}
97+
all, err := io.ReadAll(wxresp.Body)
98+
if err != nil {
99+
return err.Error(), true
100+
}
101+
re.FindAllSubmatch(all, -1)
102+
submatch := re.FindAllSubmatch(all, -1)
103+
if len(submatch) == 0 {
104+
return "检测失败", true
105+
}
106+
107+
if bytes.Contains(submatch[0][1], []byte("该地址为IP地址")) {
108+
return "IP地址", true
109+
}
110+
jsdata := WXRESP{}
111+
err = json.Unmarshal(submatch[0][1], &jsdata)
112+
if err != nil {
113+
return err.Error(), true
114+
}
115+
if jsdata.Type == "empty" {
116+
return jsdata.Title, false
117+
} else {
118+
return jsdata.Desc, false
119+
}
120+
}
121+
return "检测失败 未找到Loc", true
122+
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
package Processor

import (
	"net/http"
	"testing"

	"GScan/infoscan/dao"
	"GScan/pkg"
)
8+
9+
func Test_wxDominCheck_check(t *testing.T) {
10+
type args struct {
11+
url0 string
12+
}
13+
tests := []struct {
14+
name string
15+
args args
16+
want string
17+
want1 bool
18+
}{
19+
{
20+
name: "vshex",
21+
args: args{url0: "http://vshex.com"},
22+
},
23+
}
24+
25+
for _, tt := range tests {
26+
t.Run(tt.name, func(t *testing.T) {
27+
w := &WXDomainCheck{
28+
Scheduler: pkg.QueueScheduler[*dao.Page]{},
29+
}
30+
w.Run()
31+
got, got1 := w.check(tt.args.url0)
32+
if got != tt.want {
33+
t.Errorf("check() got = %v, want %v", got, tt.want)
34+
}
35+
if got1 != tt.want1 {
36+
t.Errorf("check() got1 = %v, want %v", got1, tt.want1)
37+
}
38+
})
39+
}
40+
}

infoscan/service/Crawler/Spider/engine.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,16 @@ func (s *Spider) Processor(page *dao.Page, body []byte) {
3131
s.AddUrlbypage([]*dao.Page{page})
3232
}
3333
}
34-
logger.PF(logger.LWARN, "<Spider>[%s]%s访问出错(%d),%s", s.Host, page.URL, page.ErrorNum, page.Error)
34+
if !strings.HasPrefix(page.Error, "not text") {
35+
logger.PF(logger.LWARN, "<Spider>[%s]%s访问出错(%d),%s", s.Host, page.URL, page.ErrorNum, page.Error)
36+
}
3537
s.DAO.UpdatePage(page)
3638
return
3739
}
3840
urls := Processor.Findurl(body, page.URL)
39-
logger.PF(logger.LDEBUG, "<Spider>[%s]%s发现内链%d个,外链%d个", s.Host, page.URL, len(urls[0]), len(urls[1]))
41+
if len(urls[0]) > 0 || len(urls[1]) > 0 {
42+
logger.PF(logger.LINFO, "<Spider>[%s]%s发现内链%d个,外链%d个", s.Host, page.URL, len(urls[0]), len(urls[1]))
43+
}
4044
for _, u := range urls[1] {
4145
page.ExtURLList = append(page.ExtURLList, u.String())
4246
}
@@ -82,6 +86,7 @@ func (s *Spider) AddNewPage(urls []*url.URL) ([]*dao.Page, error) {
8286
pg.JobID = s.JobID
8387
pg.Status = "未访问"
8488
pg.Model = gorm.Model{}
89+
pg.ID = 0
8590
pg.URL = surl.String()
8691
pg.Title = ""
8792
pg.Error = ""

pkg/BytePoll.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
package pkg
2+
3+
import (
4+
"sync"
5+
)
6+
7+
// BytePoll is a process-wide pool of byte slices intended as reusable
// scratch buffers; New hands out an empty slice.
// NOTE(review): the name looks like a typo for "BytePool" — renaming would
// break existing callers, so it is left as-is.
// NOTE(review): storing []byte values in a sync.Pool boxes the slice header
// into an interface on every Put (one allocation each time); the usual
// pattern is to pool *[]byte or *bytes.Buffer instead — worth confirming
// against the call sites.
var BytePoll = sync.Pool{New: func() any {
	return []byte{}
}}

0 commit comments

Comments
 (0)