Skip to content

GoSpider 源码学习

date
2024-03-12 21:24:30

项目介绍

项目地址:https://github.com/jaeles-project/gospider

GoSpider 是一款静态爬虫。之前读过动态爬虫 crawlergo 的源码,现在来看一看静态爬虫是怎么实现的。

介绍如下:

image-20240323004407171

url:

  • sitemap 站点地图
  • robots.txt
  • Wayback Machine、Common Crawl、Virus Total、Alien Vault:从外部数据源获取 url,和 lc/gau 差不多
  • 响应中提取

响应:

  • subdomains
  • aws-s3

项目结构

├── core
   ├── crawler.go              # 爬虫        
   ├── grep.go                 # 提取子域名、S3
   ├── linkfinder.go           # 从响应中寻找路径
   ├── logger.go               # 日志
   ├── othersource.go          # 其他源 ( 外部获取 )
   ├── output.go               # 输出
   ├── robots.go               # 从 robots.txt 中提取
   ├── sitemap.go              # 从站点地图中提取
   ├── utils.go                # 工具
   └── version.go
├── Dockerfile
├── main.go
├── README.md
└── stringset
    ├── filter.go               # 链接判重
    └── set.go

源码学习

crawler.go 是爬行的具体实现,先看一下 NewCrawler:

这里可以看到它使用了 colly 爬虫框架去完成整个爬虫的初始化操作。

// NewCrawler builds a Crawler for site from the command-line flags on cmd.
//
// It creates an asynchronous colly collector, wires an HTTP client (proxy,
// timeout, redirect policy), installs request callbacks for cookies, headers
// and the User-Agent, sets the URL allow/deny filters and rate limits, and
// finally clones the collector into a second one used for link finding.
//
// The process exits with status 1 when the domain cannot be derived from site
// or when the limit rule is rejected by colly.
func NewCrawler(site *url.URL, cmd *cobra.Command) *Crawler {
    // Root domain of the target; stored on the Crawler as its domain field.
    domain := GetDomain(site)
    if domain == "" {
        Logger.Error("Failed to parse domain")
        os.Exit(1)
    }
    Logger.Infof("Start crawling: %s", site)

    quiet, _ := cmd.Flags().GetBool("quiet")
    jsonOutput, _ := cmd.Flags().GetBool("json")
    maxDepth, _ := cmd.Flags().GetInt("depth")
    concurrent, _ := cmd.Flags().GetInt("concurrent")
    delay, _ := cmd.Flags().GetInt("delay")
    randomDelay, _ := cmd.Flags().GetInt("random-delay")
    length, _ := cmd.Flags().GetBool("length")
    raw, _ := cmd.Flags().GetBool("raw")
    subs, _ := cmd.Flags().GetBool("subs")

    // Main colly collector.
    c := colly.NewCollector(
        colly.Async(true),        // asynchronous requests
        colly.MaxDepth(maxDepth), // crawl depth limit
        colly.IgnoreRobotsTxt(),  // do not honour robots.txt
    )

    // Setup http client
    client := &http.Client{}

    // Set proxy
    proxy, _ := cmd.Flags().GetString("proxy")
    if proxy != "" {
        Logger.Infof("Proxy: %s", proxy)
        pU, err := url.Parse(proxy)
        if err != nil {
            Logger.Error("Failed to set proxy")
        } else {
            // The proxy is installed on the shared transport, which is
            // attached to the client further down.
            DefaultHTTPTransport.Proxy = http.ProxyURL(pU)
        }
    }

    // Set request timeout
    timeout, _ := cmd.Flags().GetInt("timeout")
    if timeout == 0 {
        Logger.Info("Your input timeout is 0. Gospider will set it to 10 seconds")
        client.Timeout = 10 * time.Second
    } else {
        client.Timeout = time.Duration(timeout) * time.Second
    }

    // Disable redirect
    noRedirect, _ := cmd.Flags().GetBool("no-redirect")
    if noRedirect {
        client.CheckRedirect = func(req *http.Request, via []*http.Request) error {
            nextLocation := req.Response.Header.Get("Location")
            Logger.Debugf("Found Redirect: %s", nextLocation)
            // Allow in redirect from http to https or in same hostname
            // We just check contain hostname or not because we set URLFilter in main collector so if
            // the URL is https://otherdomain.com/?url=maindomain.com, it will reject it
            if strings.Contains(nextLocation, site.Hostname()) {
                Logger.Infof("Redirecting to: %s", nextLocation)
                return nil
            }
            return http.ErrUseLastResponse
        }
    }

    // Set client transport
    client.Transport = DefaultHTTPTransport
    c.SetClient(client)

    // Get headers here to overwrite if "burp" flag used: the cookie and the
    // headers of the raw request stored in the Burp file are replayed on
    // every colly request.
    burpFile, _ := cmd.Flags().GetString("burp")
    if burpFile != "" {
        bF, err := os.Open(burpFile)
        if err != nil {
            Logger.Errorf("Failed to open Burp File: %s", err)
        } else {
            rd := bufio.NewReader(bF)
            req, err := http.ReadRequest(rd)
            // Only the parsed cookies and headers are used below, so the file
            // can be released right away. The close error of a read-only
            // file carries no useful information here.
            _ = bF.Close()
            if err != nil {
                Logger.Errorf("Failed to Parse Raw Request in %s: %s", burpFile, err)
            } else {
                // Set cookie
                c.OnRequest(func(r *colly.Request) {
                    r.Headers.Set("Cookie", GetRawCookie(req.Cookies()))
                })
                // Set headers
                c.OnRequest(func(r *colly.Request) {
                    for k, v := range req.Header {
                        r.Headers.Set(strings.TrimSpace(k), strings.TrimSpace(v[0]))
                    }
                })
            }
        }
    }

    // Set cookies (ignored when a Burp file is given)
    cookie, _ := cmd.Flags().GetString("cookie")
    if cookie != "" && burpFile == "" {
        c.OnRequest(func(r *colly.Request) {
            r.Headers.Set("Cookie", cookie)
        })
    }

    // Set headers (ignored when a Burp file is given)
    headers, _ := cmd.Flags().GetStringArray("header")
    if burpFile == "" {
        for _, h := range headers {
            headerArgs := strings.SplitN(h, ":", 2)
            // A value without ":" has no header value; skip it instead of
            // indexing past the end of the slice.
            if len(headerArgs) != 2 {
                Logger.Errorf("Invalid header %q: expected \"Name: value\"", h)
                continue
            }
            headerKey := strings.TrimSpace(headerArgs[0])
            headerValue := strings.TrimSpace(headerArgs[1])
            c.OnRequest(func(r *colly.Request) {
                r.Headers.Set(headerKey, headerValue)
            })
        }
    }

    // Set User-Agent: "mobi" and "web" (case-insensitive) select colly's
    // random User-Agent extensions; any other value is sent verbatim.
    randomUA, _ := cmd.Flags().GetString("user-agent")
    switch strings.ToLower(randomUA) {
    case "mobi":
        extensions.RandomMobileUserAgent(c)
    case "web":
        extensions.RandomUserAgent(c)
    default:
        // Keep the caller's original casing; only the keyword comparison
        // above is case-insensitive.
        c.UserAgent = randomUA
    }

    // Set referer
    extensions.Referer(c)

    // Init Output
    var output *Output
    outputFolder, _ := cmd.Flags().GetString("output")
    if outputFolder != "" {
        filename := strings.ReplaceAll(site.Hostname(), ".", "_")
        output = NewOutput(outputFolder, filename)
    }

    // Init Length Filter: a comma-separated list of response lengths to
    // ignore; entries that are not integers are skipped.
    filterLength_slice := []int{}
    filterLength, _ := cmd.Flags().GetString("filter-length")
    if filterLength != "" {
        for _, arg := range strings.Split(filterLength, ",") {
            if n, err := strconv.Atoi(arg); err == nil {
                filterLength_slice = append(filterLength_slice, n)
            }
        }
    }

    // Set url whitelist regex. The host name is escaped so that its dots
    // match literally rather than as regex wildcards.
    host := regexp.QuoteMeta(site.Hostname())
    reg := ""
    if subs {
        reg = host
    } else {
        reg = "(?:https|http)://" + host
    }

    sRegex := regexp.MustCompile(reg)
    c.URLFilters = append(c.URLFilters, sRegex)

    // Set Limit Rule
    err := c.Limit(&colly.LimitRule{
        DomainGlob:  "*",
        Parallelism: concurrent,
        Delay:       time.Duration(delay) * time.Second,
        RandomDelay: time.Duration(randomDelay) * time.Second,
    })
    if err != nil {
        Logger.Errorf("Failed to set Limit Rule: %s", err)
        os.Exit(1)
    }

    // GoSpider default disallowed regex: URLs ending in these media, font
    // and stylesheet extensions are never requested.
    disallowedRegex := `(?i)\.(png|apng|bmp|gif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|css)(?:\?|#|$)`
    c.DisallowedURLFilters = append(c.DisallowedURLFilters, regexp.MustCompile(disallowedRegex))

    // Set optional blacklist url regex
    blacklists, _ := cmd.Flags().GetString("blacklist")
    if blacklists != "" {
        c.DisallowedURLFilters = append(c.DisallowedURLFilters, regexp.MustCompile(blacklists))
    }

    // Set optional whitelist url regex; it replaces the host-based filter.
    whiteLists, _ := cmd.Flags().GetString("whitelist")
    if whiteLists != "" {
        c.URLFilters = make([]*regexp.Regexp, 0)
        c.URLFilters = append(c.URLFilters, regexp.MustCompile(whiteLists))
    }

    // A whitelist domain replaces whatever URL filter was set before it.
    whiteListDomain, _ := cmd.Flags().GetString("whitelist-domain")
    if whiteListDomain != "" {
        c.URLFilters = make([]*regexp.Regexp, 0)
        c.URLFilters = append(c.URLFilters, regexp.MustCompile("http(s)?://"+whiteListDomain))
    }

    // linkFinderCollector
    linkFinderCollector := c.Clone()
    // Try to request as much as Javascript source and don't care about domain.
    // The result of link finder will be send to Link Finder Collector to check is it working or not.
    linkFinderCollector.URLFilters = nil
    if whiteLists != "" {
        linkFinderCollector.URLFilters = append(linkFinderCollector.URLFilters, regexp.MustCompile(whiteLists))
    }
    if whiteListDomain != "" {
        linkFinderCollector.URLFilters = append(linkFinderCollector.URLFilters, regexp.MustCompile("http(s)?://"+whiteListDomain))
    }

    return &Crawler{
        cmd:                 cmd,
        C:                   c,
        LinkFinderCollector: linkFinderCollector,
        site:                site,
        Quiet:               quiet,
        Input:               site.String(),
        JsonOutput:          jsonOutput,
        length:              length,
        raw:                 raw,
        domain:              domain,
        Output:              output,
        urlSet:              stringset.NewStringFilter(),
        subSet:              stringset.NewStringFilter(),
        jsSet:               stringset.NewStringFilter(),
        formSet:             stringset.NewStringFilter(),
        awsSet:              stringset.NewStringFilter(),
        filterLength_slice:  filterLength_slice,
    }
}

再看下它是如何处理链接的:

主要是使用 colly 的 OnHTML、OnResponse 去完成了链接提取操作:

  1. href => Request.Visit 访问
  2. form[action] => 添加到 formSet
  3. input[type="file"] => 添加到 uploadFormSet
  4. [src] => js、json、xml 链接 => feedLinkfinder 进行链接提取操作
  5. 响应 => 提取 subdomain、aws_s3
// Start registers the HTML, response and error callbacks on the main
// collector and then visits the target site.
//
// When linkfinder is true, setupLinkFinder is called first so that responses
// fetched by the link-finder collector are processed as well.
func (crawler *Crawler) Start(linkfinder bool) {
    // Setup Link Finder
    if linkfinder {
        crawler.setupLinkFinder()
    }

    // Handle url: every element carrying an href attribute is resolved to an
    // absolute URL, reported once, and queued on the collector.
    crawler.C.OnHTML("[href]", func(e *colly.HTMLElement) {
        urlString := e.Request.AbsoluteURL(e.Attr("href"))
        urlString = FixUrl(crawler.site, urlString)
        if urlString == "" {
            return
        }
        // Only report and visit URLs not already recorded in urlSet.
        if !crawler.urlSet.Duplicate(urlString) {
            outputFormat := fmt.Sprintf("[href] - %s", urlString)
            if crawler.JsonOutput {
                sout := SpiderOutput{
                    Input:      crawler.Input,
                    Source:     "body",
                    OutputType: "href", // matches the "[href]" tag of the text output
                    Output:     urlString,
                }
                if data, err := jsoniter.MarshalToString(sout); err == nil {
                    outputFormat = data
                    fmt.Println(outputFormat)
                }
            } else if !crawler.Quiet {
                fmt.Println(outputFormat)
            }
            if crawler.Output != nil {
                crawler.Output.WriteToFile(outputFormat)
            }
            // Queue the link; the error is dropped on purpose because colly
            // reports filtered or already-visited URLs through it.
            _ = e.Request.Visit(urlString)
        }
    })

    // Handle form: report the URL of each page that contains a form with an
    // action attribute (the page URL is reported, not the action value).
    crawler.C.OnHTML("form[action]", func(e *colly.HTMLElement) {
        formUrl := e.Request.URL.String()
        if !crawler.formSet.Duplicate(formUrl) {
            outputFormat := fmt.Sprintf("[form] - %s", formUrl)
            if crawler.JsonOutput {
                sout := SpiderOutput{
                    Input:      crawler.Input,
                    Source:     "body",
                    OutputType: "form",
                    Output:     formUrl,
                }
                if data, err := jsoniter.MarshalToString(sout); err == nil {
                    outputFormat = data
                    fmt.Println(outputFormat)
                }
            } else if !crawler.Quiet {
                fmt.Println(outputFormat)
            }
            if crawler.Output != nil {
                crawler.Output.WriteToFile(outputFormat)
            }
        }
    })

    // Find Upload Form: report the URL of each page that contains a file
    // input. The dedup set is local to this call of Start.
    uploadFormSet := stringset.NewStringFilter()
    crawler.C.OnHTML(`input[type="file"]`, func(e *colly.HTMLElement) {
        uploadUrl := e.Request.URL.String()
        if !uploadFormSet.Duplicate(uploadUrl) {
            outputFormat := fmt.Sprintf("[upload-form] - %s", uploadUrl)
            if crawler.JsonOutput {
                sout := SpiderOutput{
                    Input:      crawler.Input,
                    Source:     "body",
                    OutputType: "upload-form",
                    Output:     uploadUrl,
                }
                if data, err := jsoniter.MarshalToString(sout); err == nil {
                    outputFormat = data
                    fmt.Println(outputFormat)
                }
            } else if !crawler.Quiet {
                fmt.Println(outputFormat)
            }
            if crawler.Output != nil {
                crawler.Output.WriteToFile(outputFormat)
            }
        }
    })

    // Handle js files: .js, .xml and .json sources referenced through a src
    // attribute are handed to feedLinkfinder.
    crawler.C.OnHTML("[src]", func(e *colly.HTMLElement) {
        jsFileUrl := e.Request.AbsoluteURL(e.Attr("src"))
        jsFileUrl = FixUrl(crawler.site, jsFileUrl)
        if jsFileUrl == "" {
            return
        }

        fileExt := GetExtType(jsFileUrl)
        if fileExt == ".js" || fileExt == ".xml" || fileExt == ".json" {
            crawler.feedLinkfinder(jsFileUrl, "javascript", "body")
        }
    })

    // Report every response whose body length is not in the length filter,
    // and scan in-scope responses for subdomains and S3 buckets.
    crawler.C.OnResponse(func(response *colly.Response) {
        respStr := DecodeChars(string(response.Body))

        if len(crawler.filterLength_slice) == 0 || !contains(crawler.filterLength_slice, len(respStr)) {

            // Verify which link is working
            u := response.Request.URL.String()
            outputFormat := fmt.Sprintf("[url] - [code-%d] - %s", response.StatusCode, u)

            if crawler.length {
                outputFormat = fmt.Sprintf("[url] - [code-%d] - [len_%d] - %s", response.StatusCode, len(respStr), u)
            }

            if crawler.JsonOutput {
                sout := SpiderOutput{
                    Input:      crawler.Input,
                    Source:     "body",
                    OutputType: "url",
                    StatusCode: response.StatusCode,
                    Output:     u,
                    // Note: the JSON Length field carries the number of
                    // newlines, whereas the text form prints the byte length.
                    Length: strings.Count(respStr, "\n"),
                }
                if data, err := jsoniter.MarshalToString(sout); err == nil {
                    outputFormat = data
                }
            } else if crawler.Quiet {
                outputFormat = u
            }
            fmt.Println(outputFormat)
            if crawler.Output != nil {
                crawler.Output.WriteToFile(outputFormat)
            }
            // Only responses matching the collector's URL filters are mined
            // for subdomains and S3 references.
            if InScope(response.Request.URL, crawler.C.URLFilters) {
                crawler.findSubdomains(respStr)
                crawler.findAWSS3(respStr)
            }

            if crawler.raw {
                outputFormat := fmt.Sprintf("[Raw] - \n%s\n", respStr) // PRINTCLEAN RAW for link visited only
                if !crawler.Quiet {
                    fmt.Println(outputFormat)
                }
                if crawler.Output != nil {
                    crawler.Output.WriteToFile(outputFormat)
                }
            }
        }
    })

    // Report failed requests, except for the status codes filtered below.
    crawler.C.OnError(func(response *colly.Response, err error) {
        Logger.Debugf("Error request: %s - Status code: %v - Error: %s", response.Request.URL.String(), response.StatusCode, err)
        /*
            1xx Informational
            2xx Success
            3xx Redirection
            4xx Client Error
            5xx Server Error
        */
        if response.StatusCode == 404 || response.StatusCode == 429 || response.StatusCode < 100 || response.StatusCode >= 500 {
            return
        }

        u := response.Request.URL.String()
        outputFormat := fmt.Sprintf("[url] - [code-%d] - %s", response.StatusCode, u)

        if crawler.JsonOutput {
            sout := SpiderOutput{
                Input:      crawler.Input,
                Source:     "body",
                OutputType: "url",
                StatusCode: response.StatusCode,
                Output:     u,
                Length:     strings.Count(DecodeChars(string(response.Body)), "\n"),
            }
            if data, err := jsoniter.MarshalToString(sout); err == nil {
                outputFormat = data
                fmt.Println(outputFormat)
            }
        } else if crawler.Quiet {
            fmt.Println(u)
        } else {
            fmt.Println(outputFormat)
        }

        if crawler.Output != nil {
            crawler.Output.WriteToFile(outputFormat)
        }
    })

    // Kick off the crawl at the target site.
    err := crawler.C.Visit(crawler.site.String())
    if err != nil {
        Logger.Errorf("Failed to start %s: %s", crawler.site.String(), err)
    }
}

先看一下 setupLinkFinder 是怎么实现链接提取的:

这里为 colly 添加了一个响应处理,主要工作:

  1. 子域名、S3
  2. 使用 LinkFinder 提取响应中的路径对路径进行拼接获取 URL
  3. 如果是 js 、json 的就还是交给 feedLinkfinder 处理
  4. 不是的话就访问这个拼接好的链接
// setupLinkFinder registers the response callback of the link-finder
// collector.
//
// For every in-scope response that passes the status and length filters it
// scans the body for subdomains and S3 buckets, extracts paths with
// LinkFinder, and resolves each path twice: against the URL of the response
// and against the target site. Resolved .js/.xml/.json/.map URLs go back to
// feedLinkfinder; any other URL not yet in urlSet is visited by the main
// collector.
func (crawler *Crawler) setupLinkFinder() {
    crawler.LinkFinderCollector.OnResponse(func(response *colly.Response) {
        if response.StatusCode == 404 || response.StatusCode == 429 || response.StatusCode < 100 {
            return
        }
        respStr := string(response.Body)
        // Response length filter.
        if len(crawler.filterLength_slice) != 0 && contains(crawler.filterLength_slice, len(respStr)) {
            return
        }

        // Verify which link is working
        u := response.Request.URL.String()
        //....

        // Only responses matching the main collector's URL filters are mined.
        if !InScope(response.Request.URL, crawler.C.URLFilters) {
            return
        }

        crawler.findSubdomains(respStr)
        crawler.findAWSS3(respStr)

        paths, err := LinkFinder(respStr)
        if err != nil {
            Logger.Error(err)
            return
        }

        // Paths are resolved against the URL of this response when it parses,
        // otherwise against the target site.
        baseURL := crawler.site
        if currentPathURL, err := url.Parse(u); err == nil {
            baseURL = currentPathURL
        }

        for _, relPath := range paths {
            // output ...
            rebuildURL := FixUrl(baseURL, relPath)
            if rebuildURL == "" {
                continue
            }

            // Try to request JS path
            // Try to generate URLs with main site
            fileExt := GetExtType(rebuildURL)
            if fileExt == ".js" || fileExt == ".xml" || fileExt == ".json" || fileExt == ".map" {
                crawler.feedLinkfinder(rebuildURL, "linkfinder", "javascript")
            } else if !crawler.urlSet.Duplicate(rebuildURL) {
                // Same reporting rules as the [href] handler in Start: JSON
                // mode prints the record, quiet mode prints nothing, and the
                // output file always receives a non-empty line.
                outputFormat := fmt.Sprintf("[linkfinder] - %s", rebuildURL)
                if crawler.JsonOutput {
                    sout := SpiderOutput{
                        Input:      crawler.Input,
                        Source:     response.Request.URL.String(),
                        OutputType: "linkfinder",
                        Output:     rebuildURL,
                    }
                    if data, err := jsoniter.MarshalToString(sout); err == nil {
                        outputFormat = data
                        fmt.Println(outputFormat)
                    }
                } else if !crawler.Quiet {
                    fmt.Println(outputFormat)
                }

                if crawler.Output != nil {
                    crawler.Output.WriteToFile(outputFormat)
                }
                // Queue the rebuilt URL on the main collector.
                _ = crawler.C.Visit(rebuildURL)
            }

            // Try to generate URLs with the site where Javascript file host in (must be in main or sub domain)
            urlWithJSHostIn := FixUrl(crawler.site, relPath)
            if urlWithJSHostIn == "" {
                continue
            }
            siteExt := GetExtType(urlWithJSHostIn)
            if siteExt == ".js" || siteExt == ".xml" || siteExt == ".json" || siteExt == ".map" {
                crawler.feedLinkfinder(urlWithJSHostIn, "linkfinder", "javascript")
            } else if !crawler.urlSet.Duplicate(urlWithJSHostIn) {
                _ = crawler.C.Visit(urlWithJSHostIn) //not print care for lost link
            }
        }
    })
}

它这里的 crawler.urlSet.Duplicate(urlWithJSHostIn) 只是直接判断这个 URL 是否已经存在于 map 中,以此判断是否爬取过;除此之外,请求去重这里并没有做更多处理……

这里先看一下 LinkFinder 它用于从响应中提取路径:

// linkFinderRegex matches quoted strings that look like URLs or paths. Capture
// group 1 holds the text between the quotes.
var linkFinderRegex = regexp.MustCompile(`(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')`)

// LinkFinder extracts URL and path candidates from source.
//
// It returns the first capture group of every linkFinderRegex match, passed
// through FilterNewLines, with empty results dropped and the list reduced by
// Unique. The returned error is always nil.
func LinkFinder(source string) ([]string, error) {
    // source = strings.ToLower(source)
    // Very large inputs get a line break after every ";" and "," before
    // matching — presumably so the regex does not have to scan huge single
    // lines.
    if len(source) > 1000000 {
        for _, sep := range []string{";", ","} {
            source = strings.ReplaceAll(source, sep, sep+"\r\n")
        }
    }
    decoded := DecodeChars(source)

    var found []string
    for _, groups := range linkFinderRegex.FindAllStringSubmatch(decoded, -1) {
        if candidate := FilterNewLines(groups[1]); candidate != "" {
            found = append(found, candidate)
        }
    }
    return Unique(found), nil
}

对于这种先插入换行、再进行正则匹配的处理方式,GPT 给出的好处如下:

image-20240323013127911

然后看下这个 FixUrl 它是用于拼接 url 和爬到的路径的:

它使用了 ResolveReference 进行这个操作,它会自动处理绝对路径和相对路径;如果传入的路径本身就是完整的 url,则不做拼接,直接返回这个 url,挺方便的。
// FixUrl resolves nextLoc against mainSite and returns the resulting URL as a
// string. An absolute nextLoc is returned as parsed; a relative one is joined
// onto mainSite. It returns "" when nextLoc cannot be parsed as a URL.
func FixUrl(mainSite *url.URL, nextLoc string) string {
    // (*url.URL).Parse parses the reference and resolves it against the
    // receiver in one step.
    resolved, err := mainSite.Parse(nextLoc)
    if err != nil {
        return ""
    }
    return resolved.String()
}

然后就是 feedLinkfinder 对于 js 这种文件的操作:

// feedLinkfinder hands jsFileUrl to the link-finder collector unless jsSet
// already reports it as a duplicate. The responses are processed by the
// callback registered in setupLinkFinder.
//
// OutputType and source are not used in the portion of the function shown
// here (the reporting code is elided).
func (crawler *Crawler) feedLinkfinder(jsFileUrl string, OutputType string, source string) {
    // Already fed to the link finder: nothing to do.
    if crawler.jsSet.Duplicate(jsFileUrl) {
        return
    }
    // ....

    // Send Javascript to Link Finder Collector
    _ = crawler.LinkFinderCollector.Visit(jsFileUrl)
}

它还对每个请求都检测了子域名和 s3 :

这里也是通过正则去提取的,这里其实可以看到俩函数的结构是相同的,可以把这些正则放到配置文件,然后之后去遍历这样子,方便后续添加新的规则:

// SUBRE is the regular-expression prefix used to match subdomains: one or more
// host labels, each followed by a literal dot. subdomainRegex appends the
// escaped root domain to it.
const SUBRE = `(?i)(([a-zA-Z0-9]{1}|[_a-zA-Z0-9]{1}[_a-zA-Z0-9-]{0,61}[a-zA-Z0-9]{1})[.]{1})+`

// AWSS3 matches Amazon S3 references: virtual-hosted bucket names
// (bucket.s3.amazonaws.com, bucket.s3-<region>.amazonaws.com), S3 website
// endpoints, and path-style URLs (//s3.amazonaws.com/bucket,
// //s3-<region>.amazonaws.com/bucket).
//
// The region part of the virtual-hosted form uses "+" so that multi-character
// regions such as "us-west-2" match, as the path-style form already does.
var AWSS3 = regexp.MustCompile(`(?i)[a-z0-9.-]+\.s3\.amazonaws\.com|[a-z0-9.-]+\.s3-[a-z0-9-]+\.amazonaws\.com|[a-z0-9.-]+\.s3-website[.-](eu|ap|us|ca|sa|cn)|//s3\.amazonaws\.com/[a-z0-9._-]+|//s3-[a-z0-9-]+\.amazonaws\.com/[a-z0-9._-]+`)

// subdomainRegex returns a Regexp object initialized to match
// subdomain names that end with the domain provided by the parameter.
//
// The domain is escaped with regexp.QuoteMeta, so its dots match literally and
// any other regex metacharacter in it cannot break compilation.
func subdomainRegex(domain string) *regexp.Regexp {
    return regexp.MustCompile(SUBRE + regexp.QuoteMeta(domain))
}

// GetSubdomains returns every subdomain of domain found in source (for example
// a response body), each passed through CleanSubdomain. Matches are returned
// in order of appearance and are not de-duplicated; the result is nil when
// nothing matches.
func GetSubdomains(source, domain string) []string {
    var subs []string
    // Only the whole match is needed, so the submatch groups are not requested.
    for _, hit := range subdomainRegex(domain).FindAllString(source, -1) {
        subs = append(subs, CleanSubdomain(hit))
    }
    return subs
}

// GetAWSS3 returns every AWS S3 reference that the AWSS3 pattern finds in
// source, each passed through DecodeChars. Matches are returned in order of
// appearance and are not de-duplicated; the result is nil when nothing
// matches.
func GetAWSS3(source string) []string {
    var buckets []string
    // Only the whole match is needed, so the submatch groups are not requested.
    for _, hit := range AWSS3.FindAllString(source, -1) {
        buckets = append(buckets, DecodeChars(hit))
    }
    return buckets
}

robots 中提取:

// ParseRobots downloads <site>/robots.txt and, when the server answers 200,
// reports every Allow/Disallow entry and queues it on collector c.
//
// Each entry is resolved against site with FixUrl; entries that cannot be
// resolved are skipped. wg.Done is called when the function returns. Network
// and read errors end the function silently.
func ParseRobots(site *url.URL, crawler *Crawler, c *colly.Collector, wg *sync.WaitGroup) {
    defer wg.Done()
    robotsURL := site.String() + "/robots.txt"

    resp, err := http.Get(robotsURL)
    if err != nil {
        return
    }
    // Release the connection on every return path.
    defer resp.Body.Close()

    if resp.StatusCode != 200 {
        return
    }
    Logger.Infof("Found robots.txt: %s", robotsURL)

    body, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        return
    }

    // Strips everything up to and including "Allow: " / "Disallow: ".
    re := regexp.MustCompile(".*llow: ")
    for _, line := range strings.Split(string(body), "\n") {
        if !strings.Contains(line, "llow: ") {
            continue
        }
        // Trim surrounding whitespace: files with CRLF line endings leave a
        // trailing "\r" that url.Parse rejects as a control character, which
        // would silently drop every rule.
        rule := strings.TrimSpace(re.ReplaceAllString(line, ""))
        entryURL := FixUrl(site, rule)
        if entryURL == "" {
            continue
        }
        outputFormat := fmt.Sprintf("[robots] - %s", entryURL)

        if crawler.JsonOutput {
            sout := SpiderOutput{
                Input:      crawler.Input,
                Source:     "robots",
                OutputType: "url",
                Output:     entryURL,
            }
            if data, err := jsoniter.MarshalToString(sout); err == nil {
                outputFormat = data
            }
        } else if crawler.Quiet {
            outputFormat = entryURL
        }
        fmt.Println(outputFormat)
        if crawler.Output != nil {
            crawler.Output.WriteToFile(outputFormat)
        }
        // Queue the entry; the error is dropped on purpose because colly
        // reports filtered or already-visited URLs through it.
        _ = c.Visit(entryURL)
    }
}

站点地图,使用 gopher-parse-sitemap 库进行的 sitemap 解析:

// ParseSiteMap probes a fixed list of common sitemap paths under site and, for
// every entry of each sitemap that can be parsed, reports the entry location
// and queues it on collector c. wg.Done is called when the function returns.
func ParseSiteMap(site *url.URL, crawler *Crawler, c *colly.Collector, wg *sync.WaitGroup) {
    defer wg.Done()

    sitemapUrls := []string{
        "/sitemap.xml",
        "/sitemap_news.xml",
        "/sitemap_index.xml",
        "/sitemap-index.xml",
        "/sitemapindex.xml",
        "/sitemap-news.xml",
        "/post-sitemap.xml",
        "/page-sitemap.xml",
        "/portfolio-sitemap.xml",
        "/home_slider-sitemap.xml",
        "/category-sitemap.xml",
        "/author-sitemap.xml",
    }

    // handleEntry reports one sitemap entry and schedules it for crawling.
    handleEntry := func(entry sitemap.Entry) error {
        location := entry.GetLocation()
        line := fmt.Sprintf("[sitemap] - %s", location)

        switch {
        case crawler.JsonOutput:
            record := SpiderOutput{
                Input:      crawler.Input,
                Source:     "sitemap",
                OutputType: "url",
                Output:     location,
            }
            if encoded, err := jsoniter.MarshalToString(record); err == nil {
                line = encoded
            }
        case crawler.Quiet:
            line = location
        }

        fmt.Println(line)
        if crawler.Output != nil {
            crawler.Output.WriteToFile(line)
        }
        _ = c.Visit(location)
        return nil
    }

    base := site.String()
    for _, path := range sitemapUrls {
        target := base + path
        Logger.Infof("Trying to find %s", target)
        // Ignore error when that not valid sitemap.xml path
        _ = sitemap.ParseFromSite(target, handleEntry)
    }
}

这里还有一些第三方的 url 收集,就是 Wayback Machine、Common Crawl、Virus Total、Alien Vault 这些:

image-20240323033408855

// getWaybackURLs queries the web.archive.org CDX API for URLs archived under
// domain and returns one wurl (date, url) per result row.
//
// Subdomains are included through a "*." wildcard unless noSubs is true. An
// empty slice and a non-nil error are returned when the request fails, the
// body cannot be read, or the body is not the expected JSON array of string
// arrays.
func getWaybackURLs(domain string, noSubs bool) ([]wurl, error) {
    subsWildcard := "*."
    if noSubs {
        subsWildcard = ""
    }

    res, err := http.Get(
        fmt.Sprintf("http://web.archive.org/cdx/search/cdx?url=%s%s/*&output=json&collapse=urlkey", subsWildcard, domain),
    )
    if err != nil {
        return []wurl{}, err
    }
    // Release the connection on every return path.
    defer res.Body.Close()

    raw, err := ioutil.ReadAll(res.Body)
    if err != nil {
        return []wurl{}, err
    }
    // An empty body means "no results", not malformed JSON.
    if len(raw) == 0 {
        return []wurl{}, nil
    }

    var wrapper [][]string
    if err := json.Unmarshal(raw, &wrapper); err != nil {
        return []wurl{}, err
    }

    out := make([]wurl, 0, len(wrapper))
    for i, row := range wrapper {
        // The first row is a header describing the columns, so skip it.
        if i == 0 {
            continue
        }
        // Guard against short rows instead of indexing out of range.
        if len(row) < 3 {
            continue
        }
        out = append(out, wurl{date: row[1], url: row[2]})
    }

    return out, nil
}

学习总结

  1. 学到了 colly 爬虫框架的使用
  2. http.ProxyURL(pU)、ResolveReference

不过还有一些地方可以再优化一下,比如请求队列去重,GoSpider 并没有做这个操作,还有对于 aws-s3 这些响应中敏感信息的提取可以去写一个配置文件来实现这个操作,有助于后期增添新的规则,动态爬虫也可以实现这一操作。

看到网上使用 GoSpider 后会使用 uro 进行链接去重,后面可以看看。