An old version of xscan has been open-sourced on 知识星球 (Knowledge Planet). Let's take a look, focusing mainly on the XSS detection logic.
## Project Structure

```
xscan-1.2/
├── cmd/xscan/
├── core/
│   ├── refxss/
│   ├── spider/
│   ├── parser/
│   ├── httpx/
│   ├── queue/
│   ├── stringset/
│   └── logger/
├── go.mod
└── readme.md
```
## Source Code Walkthrough

### Queue Module

`core/queue`:

- `core/queue/leveldb_queue.go` => data storage based on goleveldb
- `core/queue/queue.go` => a queue data structure implemented in Go

[syndtr/goleveldb](https://github.com/syndtr/goleveldb) is "LevelDB key/value database in Go": a key/value database with file-backed storage. The `leveldb_queue.go` wrapper:
```go
package queue

import (
	"github.com/syndtr/goleveldb/leveldb"
	"sync"
)

type LevelQueue struct {
	Db   *leveldb.DB
	lock sync.Mutex
}

func NewLevelQueue() (*LevelQueue, error) {
	db, err := leveldb.OpenFile("./db/block.db", nil)
	if err != nil {
		return nil, err
	}
	return &LevelQueue{Db: db, lock: sync.Mutex{}}, nil
}

func (l *LevelQueue) Close() {
	l.Db.Close()
}

func (l *LevelQueue) Push(v []byte) error {
	return l.Db.Put(v, nil, nil)
}

func (l *LevelQueue) Len() int {
	db := l.Db
	iter := db.NewIterator(nil, nil)
	index := 0
	for iter.Next() {
		index += 1
	}
	iter.Release()
	return index
}
```
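A quick usage sketch (the import path here is an assumption). One detail worth noticing: `Push` stores the value as the LevelDB *key* with a nil value, so pushing the same bytes twice stores only one entry, which gives the crawl queue de-duplication for free.

```go
package main

import (
	"fmt"
	"log"

	"xscan/core/queue" // assumed module path
)

func main() {
	q, err := queue.NewLevelQueue()
	if err != nil {
		log.Fatal(err)
	}
	defer q.Close()

	// The value itself is the LevelDB key, so duplicates collapse.
	_ = q.Push([]byte(`{"url":"http://example.com/a"}`))
	_ = q.Push([]byte(`{"url":"http://example.com/a"}`)) // same key, stored once
	_ = q.Push([]byte(`{"url":"http://example.com/b"}`))

	fmt.Println(q.Len()) // 2
}
```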
### Crawler Module

xscan uses a static crawler plus third-party Wayback data, similar to gospider (covered previously), though with some differences.
#### Static Crawler

`core/spider/spider.go`: `Start` => `feed` => `Visit` => `handleResult`. The main logic lives in `feed()`:
```go
func (c *Crawler) feed() {
	var wg sync.WaitGroup
	c.feedChain = make(chan SpiderOutput)
	p, _ := ants.NewPoolWithFunc(c.processNum, func(i interface{}) {
		s := i.(SpiderOutput)
		c.Visit(s)
		wg.Done()
	})
	defer p.Release()
	go func() {
		maxSize := 100000
		index := 0
		db := c.queue.Db
		iter := db.NewIterator(nil, nil)
		for iter.Next() {
			if index > maxSize {
				break
			}
			v := Decode(iter.Key())
			c.feedChain <- v
			db.Delete(iter.Key(), nil)
			index += 1
		}
		iter.Release()
		close(c.feedChain)
	}()
	for r := range c.feedChain {
		n := r
		if c.callback != nil {
			c.callback(n)
		}
		wg.Add(1)
		_ = p.Invoke(n)
	}
	time.Sleep(time.Second * 5)
	wg.Wait()
}
```
In short, a loop that keeps crawling deeper: a producer goroutine drains up to 100,000 entries from the LevelDB-backed queue (deleting each key as it goes) into a channel, and an ants worker pool visits each one. Next, the logic of `Visit`:
```go
func (c *Crawler) Visit(p SpiderOutput) {
	c.limitSecond.Take()
	output := p.Output
	req, err := c.hp.NewRequest(p.Method, output, nil, nil)
	if err != nil {
		logrus.WithError(err).Debugln()
		return
	}
	referer := req.Header.Get("Referer")
	if referer == "" {
		req.Header.Set("Referer", p.Input)
	}
	resp, err := c.hp.Do(req)
	if err != nil {
		logrus.WithError(err).Debugf("spider request failed: %s", req.URL.String())
		return
	}
	c.handleResult(resp)
}
```
From `handleResult` you can see that the 1.2 static crawler is fairly bare-bones: it only extracts `href` links and forms, so there is a real gap compared to gospider.
```go
func (c *Crawler) handleResult(nResp *httpx.Response) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(nResp.Text))
	if err != nil {
		logrus.WithError(err).Warningf("goquery parse failed")
		return
	}
	reqUrl := nResp.Req.URL
	doc.Find("[href]").Each(func(i int, s *goquery.Selection) {
		href, ok := s.Attr("href")
		if ok {
			urlString := UrlJoin(reqUrl, href)
			if c.filterUrl(urlString, httpx.GET) {
				return
			}
			out := SpiderOutput{
				Input:  reqUrl.String(),
				Source: "href",
				Output: urlString,
				Header: header2map(nResp.Req.Header),
				Method: httpx.GET,
				Body:   "",
			}
			c.queue.Push(Encode(out))
		}
	})
	doc.Find("form[action]").Each(func(i int, s *goquery.Selection) {
		action, ok := s.Attr("action")
		method, ok2 := s.Attr("method")
		if !ok2 {
			method = "GET"
		}
		method = strings.ToUpper(method)
		if ok {
			action_url := UrlJoin(nResp.Req.URL, action)
			if c.filterUrl(action_url, method) {
				return
			}
			p := url.Values{}
			s.Find("input[name]").Each(func(i int, selection *goquery.Selection) {
				name, exist := selection.Attr("name")
				v, exist2 := selection.Attr("value")
				if !exist {
					return
				}
				if !exist2 {
					v = core.RandomStr(4)
				}
				p.Set(name, v)
			})
			nResp.Req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
			out := SpiderOutput{
				Input:  reqUrl.String(),
				Source: "form",
				Output: action_url,
				Header: header2map(nResp.Req.Header),
				Method: method,
				Body:   p.Encode(),
			}
			c.queue.Push(Encode(out))
		}
	})
}
```
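`UrlJoin` isn't shown in the article. A minimal sketch of what it presumably does, resolving an `href` against the page URL the way a browser would (only the name comes from the source; the body is an assumption):

```go
import "net/url"

// UrlJoin resolves href relative to the page URL, as a browser would.
// Hypothetical reimplementation; xscan's actual helper isn't shown.
func UrlJoin(base *url.URL, href string) string {
	ref, err := url.Parse(href)
	if err != nil {
		return ""
	}
	return base.ResolveReference(ref).String()
}
```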
Still, the crawl queue as a whole is backed by LevelDB file storage rather than held entirely in memory, which is worth borrowing. Every link found by `handleResult` is wrapped as a `SpiderOutput` and pushed onto the queue; the goroutine at [2] in `feed()` pops entries and hands them to the callback. The callback is defined at the entry point: it repackages the data and submits it to the goroutine pool to keep crawling, up to the 100,000 cap:
```go
func spiderCallback(out spider.SpiderOutput) {
	logrus.WithField("URL", out.Output).WithField("Input", out.Input).Debugln()
	var postdata []byte
	if out.Body != "" {
		postdata = []byte(out.Body)
	}
	p := refxss.Parameter{
		Url:     out.Output,
		Method:  out.Method,
		POST:    postdata,
		Headers: out.Header,
	}
	wg.Add()
	_ = xssPool.Invoke(p)
}
```
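The `Encode`/`Decode` pair used when pushing to and draining the queue isn't shown either. A gob-based sketch of what such a codec could look like (an assumption; the real implementation may use a different serialization):

```go
import (
	"bytes"
	"encoding/gob"
)

// Hypothetical SpiderOutput codec; xscan's actual Encode/Decode aren't shown.
func Encode(out SpiderOutput) []byte {
	var buf bytes.Buffer
	_ = gob.NewEncoder(&buf).Encode(out) // SpiderOutput holds only gob-friendly fields
	return buf.Bytes()
}

func Decode(data []byte) SpiderOutput {
	var out SpiderOutput
	_ = gob.NewDecoder(bytes.NewReader(data)).Decode(&out)
	return out
}
```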
#### Third-Party Sources

`core/spider/source.go` defines fetching links from web.archive.org (the Wayback Machine), although it doesn't appear to actually be used:
```go
func getWaybackURLs(domain string) ([]string, error) {
	res, err := http.Get(
		fmt.Sprintf("http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&collapse=urlkey", domain),
	)
	if err != nil {
		return []string{}, err
	}
	raw, err := ioutil.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		return []string{}, err
	}
	var wrapper [][]string
	err = json.Unmarshal(raw, &wrapper)
	out := make([]string, 0, len(wrapper))
	skip := true
	for _, urls := range wrapper {
		// The first row of the CDX JSON output is the header
		// ("urlkey","timestamp","original",...), so skip it.
		if skip {
			skip = false
			continue
		}
		out = append(out, urls[2])
	}
	return out, nil
}
```
### Parser Module

#### HTML

Struct definition in `core/parser/html/parser.go`:
```go
type ResponsePosition struct {
	Type    int
	TagName string
	Content string
	Attrs   []html.Attribute
}
```
`ParseHtml` parses HTML into a list of `ResponsePosition`, relying mainly on `html.NewTokenizer`. Docs: [html package - golang.org/x/net/html - Go Packages](https://pkg.go.dev/golang.org/x/net/html). The tokenizer emits these token types:
```go
const (
	ErrorToken TokenType = iota
	TextToken
	StartTagToken
	EndTagToken
	SelfClosingTagToken
	CommentToken
	DoctypeToken
)
```
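To make the token stream concrete, here is a minimal standalone example of driving `html.NewTokenizer`:

```go
package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	z := html.NewTokenizer(strings.NewReader(`<p class="x">hi<!--note--></p>`))
	for {
		tt := z.Next()
		if tt == html.ErrorToken { // io.EOF ends the stream
			return
		}
		// Prints: StartTagToken "p", TextToken "hi", CommentToken "note", EndTagToken "p"
		fmt.Printf("%-20v %q\n", tt, z.Token().Data)
	}
}
```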
The `ParseHtml` function parses the entire HTML document and converts it into `ResponsePosition` form:
```go
func ParseHtml(body io.Reader) []ResponsePosition {
	z := html.NewTokenizer(body)
	var positions []ResponsePosition
	var tokenizer []ResponsePosition
labelType:
	for {
		tt := z.Next()
		switch tt {
		case html.ErrorToken:
			break labelType
		case html.SelfClosingTagToken:
			tk := z.Token()
			raw := string(z.Raw())
			attrs := tk.Attr
			for i, attr := range attrs {
				if attr.Val == "" {
					continue
				}
				found := attr.Key + "=" // locate key= in the raw tag to recover the original quoting
				kIndex := strings.Index(raw, found)
				if kIndex != -1 {
					vIndex := core.IndexAt(raw, attr.Val, kIndex) - 1
					if vIndex > -1 && (raw[vIndex] == 34 || raw[vIndex] == 39) { // '"' or '\''
						attr.Val = raw[vIndex : vIndex+2+len(attr.Val)]
					}
				}
				attrs[i] = attr
			}
			current := ResponsePosition{
				Type:    NULL,
				TagName: tk.Data,
				Content: "",
				Attrs:   attrs,
			}
			positions = append(positions, current)
		case html.StartTagToken:
			tk := z.Token()
			raw := string(z.Raw())
			attrs := tk.Attr
			for i, attr := range attrs {
				if attr.Val == "" {
					continue
				}
				found := attr.Key + "="
				kIndex := strings.Index(raw, found)
				if kIndex != -1 {
					vIndex := core.IndexAt(raw, attr.Val, kIndex) - 1
					if vIndex > -1 && (raw[vIndex] == 34 || raw[vIndex] == 39) {
						attr.Val = raw[vIndex : vIndex+2+len(attr.Val)]
					}
				}
				attrs[i] = attr
			}
			current := ResponsePosition{
				Type:    NULL,
				TagName: tk.Data,
				Content: "",
				Attrs:   attrs,
			}
			positions = append(positions, current)
		case html.TextToken:
			if len(positions) > 0 {
				positions[len(positions)-1].Content += string(z.Raw())
			}
		case html.EndTagToken:
			if len(positions) > 0 {
				end := len(positions) - 1
				current := positions[end]
				tokenizer = append(tokenizer, current)
				positions = positions[:end]
			}
		case html.CommentToken:
			tk := z.Token()
			current := ResponsePosition{
				Type:    NULL,
				TagName: "comment",
				Content: tk.Data,
				Attrs:   nil,
			}
			tokenizer = append(tokenizer, current)
		}
	}
	for len(positions) > 0 {
		end := len(positions) - 1
		current := positions[end]
		tokenizer = append(tokenizer, current)
		positions = positions[:end]
	}
	return tokenizer
}
```
The `SearchInputInResponse` method returns the `ResponsePosition` entries that contain a given input:

```go
func SearchInputInResponse(input string, body io.Reader) ([]ResponsePosition, error) {
	tokenizer := ParseHtml(body)
	var ret []ResponsePosition
	for _, p := range tokenizer {
		if strings.Contains(p.Content, input) {
			if p.TagName == "comment" {
				ret = append(ret, ResponsePosition{
					Type:    Comment,
					TagName: p.TagName,
					Content: p.Content,
					Attrs:   p.Attrs,
				})
			} else {
				ret = append(ret, ResponsePosition{
					Type:    Text,
					TagName: p.TagName,
					Content: p.Content,
					Attrs:   p.Attrs,
				})
			}
		}
		if strings.Contains(p.TagName, input) {
			ret = append(ret, ResponsePosition{
				Type:    Tag,
				TagName: p.TagName,
				Content: p.Content,
				Attrs:   p.Attrs,
			})
		}
		if strings.ToLower(p.TagName) == "style" && strings.Contains(p.Content, input) {
			ret = append(ret, ResponsePosition{
				Type:    Style,
				TagName: p.TagName,
				Content: p.Content,
				Attrs:   p.Attrs,
			})
		}
		for _, attr := range p.Attrs {
			if strings.Contains(attr.Key, input) {
				ret = append(ret, ResponsePosition{
					Type:    AttibuteKey,
					TagName: p.TagName,
					Content: p.Content,
					Attrs:   p.Attrs,
				})
			}
			if strings.Contains(attr.Val, input) {
				ret = append(ret, ResponsePosition{
					Type:    AttibuteValue,
					TagName: p.TagName,
					Content: p.Content,
					Attrs:   p.Attrs,
				})
			}
		}
	}
	return ret, nil
}
```
#### JS

Background reading on JS ASTs: https://zhaomenghuan.js.org/blog/js-ast-principle-reveals.html

xscan uses github.com/tdewolff/parse/js for lexing/AST work:
```go
func ParseJs2AstType(src string) ([]*AstType, error) {
	input := parse.NewInputString(src)
	l := js.NewLexer(input)
	var ret []*AstType
	for {
		tt, text := l.Next()
		switch tt {
		case js.ErrorToken:
			if l.Err() != io.EOF {
				err := CreateJsParserError(input, l.Err())
				return ret, err
			}
			return ret, nil
		default:
			n := AstType{
				Type:    tt,
				Content: string(text),
			}
			ret = append(ret, &n)
		}
	}
}
```
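A standalone run of the lexer, assuming the v2 module path `github.com/tdewolff/parse/v2` (which matches the `parse.NewInputString` API used above):

```go
package main

import (
	"fmt"
	"io"

	"github.com/tdewolff/parse/v2"
	"github.com/tdewolff/parse/v2/js"
)

func main() {
	l := js.NewLexer(parse.NewInputString(`var s = "hello"; // note`))
	for {
		tt, text := l.Next()
		if tt == js.ErrorToken {
			if l.Err() != io.EOF {
				fmt.Println("lex error:", l.Err())
			}
			return
		}
		fmt.Printf("%-16v %q\n", tt, string(text))
	}
}
```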
`SearchInputInScript` collects these token types:
- `IdentifierToken`: identifiers (variable, function, and property names)
- `StringToken`: string literals, e.g. the `"hello"` in `let s = "hello"`
- `CommentToken`: comment contents
It uses `fallthrough`. Go's `switch` cases don't need an explicit break, they exit by default, and `fallthrough` overrides that by continuing into the next case. With only these three cases it hardly seems necessary; perhaps it was written this way to make later extension easier.
```go
func SearchInputInScript(input string, script string) ([]*AstType, error) {
	astTokens, err := ParseJs2AstType(script)
	if err != nil {
		return nil, err
	}
	var ret []*AstType
	for _, token := range astTokens {
		if !strings.Contains(token.Content, input) {
			continue
		}
		switch token.Type {
		case js.IdentifierToken:
			fallthrough
		case js.StringToken:
			fallthrough
		case js.CommentToken:
			n := *token
			ret = append(ret, &n)
		default:
			err = CreateTokenNotFound(token.Type, token.Content)
			return ret, err
		}
	}
	return ret, nil
}
```
Variable collection corresponds to hidden-parameter collection: harvesting names from the page that the static crawler didn't pick up:
```go
var regx = regexp.MustCompile(`^[\w\d-_]+$`)

func (w *varWalker) Enter(n js.INode) js.IVisitor {
	switch n := n.(type) {
	case *js.VarDecl:
		for i := range n.List {
			v, ok := n.List[i].Binding.(*js.Var)
			if !ok {
				continue
			}
			strV := string(v.Data)
			if regx.MatchString(strV) {
				w.vars = append(w.vars, strV)
			}
		}
	case *js.ObjectExpr:
		for _, p := range n.List {
			if p.Name != nil {
				v := p.Name.Literal.String()
				if regx.MatchString(v) {
					w.vars = append(w.vars, v)
				}
			}
		}
	}
	return w
}
```
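The article doesn't show how the walker is driven. A sketch using `js.Parse` and `js.Walk` from the same library (the struct definition and wiring are assumptions; recent versions of `tdewolff/parse/v2/js` take an `Options` argument):

```go
type varWalker struct {
	vars []string
}

func (w *varWalker) Exit(n js.INode) {} // required by js.IVisitor

// collectVars is a hypothetical driver: parse the script into an AST,
// then walk it with varWalker to harvest candidate parameter names.
func collectVars(src string) ([]string, error) {
	ast, err := js.Parse(parse.NewInputString(src), js.Options{})
	if err != nil {
		return nil, err
	}
	w := &varWalker{}
	js.Walk(w, ast)
	return w.vars, nil
}
```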
### XSS Detection Module

#### HTML

Each function implements a different XSS check.

`checkHtml`, "HTML tag not escaped, exploitable":
1. Generate a flag
2. Generate the payload: `prefix<flag>`
3. Send the request
4. Search for the flag's position in the HTML
5. If the flag appears inside a tag name, report a finding => the flag from the payload became a tag of its own
`checkStyle`, "can cause XSS under IE":

- Payload: `expression(a(%s))` => `expression` embeds JS code inside CSS styles in IE
- If the reflected position is a style and the content contains an `expression` expression, report a finding
`checkTagValue`, "quote can be closed; other event handlers can then cause XSS":

- This one looks for XSS inside tag attributes
- Payload: `fmt.Sprintf("%s%s", prefix, flag)` => the prefix here is essentially a quote character
- If the flag shows up in an attribute *key*, report a finding => the injection point was an attribute value; closing the quote shifts the flag into key position: `a="payload"` => `a="" flag` (see the demonstration below)
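A tiny self-contained demonstration of that re-parse using the same x/net/html tokenizer: once the reflected quote closes `href`, the flag re-parses as an attribute key.

```go
package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	// The server reflected `" xflag123` into the href value:
	page := `<a href="" xflag123>link</a>`
	z := html.NewTokenizer(strings.NewReader(page))
	z.Next() // StartTagToken for <a ...>
	for _, attr := range z.Token().Attr {
		fmt.Printf("key=%q val=%q\n", attr.Key, attr.Val)
	}
	// Output:
	// key="href" val=""
	// key="xflag123" val=""
}
```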
The code for these checks is all quite similar; `checkHtml` is representative:
```go
func (r *RefXss) checkHtml(target Parameter, position string, k string, prefix string) {
	flag := r.generateFlag()
	payload := fmt.Sprintf("%s<%s>", prefix, flag)
	req, resp, err := r.request(target, position, k, payload)
	if err != nil {
		logrus.WithError(err).Debugln("request failed [checkHtml]")
		return
	}
	positions, err := html.SearchInputInResponse(flag, strings.NewReader(resp.Text))
	if err != nil {
		logrus.WithError(err).Debugln("html semantic parse failed")
		return
	}
	for _, p := range positions {
		if p.Type == html.Tag && strings.Contains(p.TagName, flag) {
			lines := htmlInLine(resp.Text, payload)
			item := Item{
				Req:            req,
				Resp:           resp,
				XssType:        "html",
				Desc:           "HTML tag not escaped, exploitable",
				Position:       position,
				Parameter:      k,
				Payload:        fmt.Sprintf("%s=%s", k, payload),
				SuggestPayload: fmt.Sprintf("%s=%s", k, prefix+"<svg onload=alert(1)>//"),
				Line:           lines,
			}
			r.result.Items = append(r.result.Items, item)
		}
	}
}
```
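`generateFlag` isn't reproduced in the article; any generator of a short random marker that is unlikely to occur naturally in a page would do. A hypothetical version:

```go
import "math/rand"

// generateFlag returns a short random alphabetic marker, e.g. "xqbdhka".
// Hypothetical implementation; xscan's actual generator isn't shown.
func (r *RefXss) generateFlag() string {
	const letters = "abcdefghijklmnopqrstuvwxyz"
	b := make([]byte, 6)
	for i := range b {
		b[i] = letters[rand.Intn(len(letters))]
	}
	return "x" + string(b)
}
```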
#### JS

`checkJsString`, JS string escape; presumably for cases where a variable inside a `<script>` block is controllable:
1. Payload: `payload := fmt.Sprintf("%s-%s-%s", quote, flag, quote)`
2. Parse the HTML and extract the contents of `<script>` tags (the JS code)
3. Parse that JS into a syntax tree
4. If an `ast.Type` is `IdentifierToken` and its content contains the flag, report "the string can be escaped with quote"
5. If the escape didn't succeed, run an escaping test: `fmt.Sprintf("\\%s%s", quote, flag)`
6. If the JS AST parse now fails, the escape character broke the original syntax
7. If it parses, detection proceeds the same way as at [4]

A hedged sketch of the whole check follows.
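`checkJsString` itself isn't quoted in the article. Pieced together from the steps above and the shape of `checkJsComment` below, a sketch of the first phase (same package context and helpers as the other checks; not xscan's actual code):

```go
// Hypothetical reconstruction of checkJsString's first phase (quote breakout).
// Assumes the same imports and helpers as checkJsComment below.
func (r *RefXss) checkJsString(target Parameter, position string, k string, quote string) {
	flag := r.generateFlag()
	payload := fmt.Sprintf("%s-%s-%s", quote, flag, quote) // try to close the JS string

	_, resp, err := r.request(target, position, k, payload)
	if err != nil {
		return
	}

	// Collect the contents of <script> tags that reflect the flag.
	positions, _ := html.SearchInputInResponse(flag, strings.NewReader(resp.Text))
	var sources []string
	for _, p := range positions {
		if strings.ToLower(p.TagName) == "script" {
			sources = append(sources, p.Content)
		}
	}

	asts, err := js.SearchInputInScript(flag, strings.Join(sources, "\n"))
	if err != nil {
		return
	}
	for _, ast := range asts {
		if ast.Type == js2.IdentifierToken && strings.Contains(ast.Content, flag) {
			// The flag lexed as a bare identifier => the quote escaped the string.
			// Otherwise fall back to the escaped variant fmt.Sprintf("\\%s%s", quote, flag)
			// as described in steps 5-7 above.
			return
		}
	}
}
```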
`checkJsComment`, "the comment can be escaped with %s":
```go
func (r *RefXss) checkJsComment(target Parameter, position string, k string, s string) {
	flag := r.generateFlag()
	var payload string
	var truePayload string
	if s == "//" {
		payload = fmt.Sprintf("\n%s//", flag)
		truePayload = fmt.Sprintf("\n%s//", "alert(1)")
	} else if s == "/*" {
		payload = fmt.Sprintf("*/%s/*", flag)
		truePayload = fmt.Sprintf("\n%s//", "alert(1)")
	}
	req, resp, err := r.request(target, position, k, payload)
	if err != nil {
		logrus.WithError(err).Debugln("request failed [checkJsComment]")
		return
	}
	positions, err := html.SearchInputInResponse(flag, strings.NewReader(resp.Text))
	sources := []string{}
	for _, p := range positions {
		if strings.ToLower(p.TagName) == "script" {
			sources = append(sources, p.Content)
		}
	}
	content := strings.Join(sources, "\n")
	asts, err := js.SearchInputInScript(flag, content)
	if err != nil {
		logrus.WithError(err).Debugln("ast parse failed")
		return
	}
	lines := htmlInLine(content, flag)
	for _, ast := range asts {
		if ast.Type == js2.IdentifierToken && strings.Contains(ast.Content, flag) {
			item := Item{
				Req:            req,
				Resp:           resp,
				XssType:        "Js CommentToken",
				Desc:           fmt.Sprintf("the comment can be escaped with %s", truePayload),
				Position:       position,
				Parameter:      k,
				Payload:        fmt.Sprintf("%s=%s", k, payload),
				SuggestPayload: fmt.Sprintf("%s=%s", k, truePayload),
				Line:           lines,
			}
			r.result.Items = append(r.result.Items, item)
			return
		}
	}
}
```
#### The request Helper

The XSS `request` method sets the request parameter according to the injection position; requests go through projectdiscovery's retryablehttp library:
```go
func (r *RefXss) request(target Parameter, position string, k string, v string) (*retryablehttp.Request, *httpx.Response, error) {
	req, err := r.hp.NewRequest(target.Method, target.Url, target.POST, target.Headers)
	if err != nil {
		return nil, nil, err
	}
	switch position {
	case InjectGetParameter:
		query := req.URL.Query()
		query.Set(k, v)
		req.URL.RawQuery = query.Encode()
	case InjectPostParameter:
		vs, err := url.ParseQuery(string(target.POST))
		if err != nil {
			return nil, nil, err
		}
		vs.Set(k, v)
		req.Body = ioutil.NopCloser(strings.NewReader(vs.Encode()))
		req, err = retryablehttp.FromRequest(req.Request)
		if err != nil {
			return nil, nil, err
		}
	case InjectCookieParameter:
		cookies := req.Cookies()
		for i, vv := range cookies {
			key := vv.Name
			if key == k {
				cookies[i].Value = v
			}
		}
		req.Header.Set("Cookie", EncodeCookies(cookies))
	case InjectUriParameter:
		paths := req.URL.Path
		splits := strings.Split(paths, "/")
		kValue, _ := strconv.Atoi(k)
		s := splits[kValue]
		last := strings.LastIndex(s, ".")
		if last > -1 {
			v = v + s[last:]
		}
		splits[kValue] = v
		req.URL.Path = strings.Join(splits, "/")
	case InjectHeaderParameter:
		req.Header.Set(k, v)
	}
	resp, err := r.hp.Do(req)
	if err != nil {
		return nil, nil, err
	}
	return req, resp, nil
}
```
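The `EncodeCookies` helper called in the cookie branch isn't shown. A minimal sketch of what it presumably does, serializing cookies back into a `Cookie` header value (an assumption, not xscan's code):

```go
import (
	"net/http"
	"strings"
)

// Hypothetical EncodeCookies; xscan's actual helper isn't shown.
// Produces a Cookie header value such as "k1=v1; k2=v2".
func EncodeCookies(cookies []*http.Cookie) string {
	pairs := make([]string, 0, len(cookies))
	for _, c := range cookies {
		pairs = append(pairs, c.Name+"="+c.Value)
	}
	return strings.Join(pairs, "; ")
}
```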
### Scan Module

`Scan` is never actually invoked; `ScanWithHiddenParameter` is used instead:
1. Hidden-parameter collection
   - HTML: collect the `name` attributes of `input` elements
   - JS: collect variables from the AST, filtered through the regex
2. Add the hidden parameters and call `handle` to scan, GET requests only (see the sketch below)
3. Check XSS in the other positions (POST, COOKIE, HEADER, URI)
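A runnable sketch of the hidden-parameter idea: merge the discovered names into the query string so each one can then be fuzzed. Stdlib only; the function name and placeholder value are my own, not xscan's:

```go
package main

import (
	"fmt"
	"net/url"
)

// mergeHiddenParams seeds parameters discovered from <input name=...> and
// JS variables into the URL's query string; a sketch of the idea, not xscan's code.
func mergeHiddenParams(raw string, hidden []string) (string, error) {
	u, err := url.Parse(raw)
	if err != nil {
		return "", err
	}
	q := u.Query()
	for _, name := range hidden {
		if q.Get(name) == "" {
			q.Set(name, "init") // placeholder; the real scanner would randomize
		}
	}
	u.RawQuery = q.Encode()
	return u.String(), nil
}

func main() {
	out, _ := mergeHiddenParams("http://example.com/search?id=1", []string{"debug", "callback"})
	fmt.Println(out) // http://example.com/search?callback=init&debug=init&id=1
}
```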
## Summary

AST (abstract syntax tree) based XSS detection: rather than naive payload-reflection string matching, xscan tokenizes the HTML and lexes the JS to determine exactly where the probe lands and whether it breaks out of its syntactic context.