// NewCrawler builds a Crawler for site, configured from the flags on cmd.
//
// It creates an asynchronous colly collector, installs an HTTP client
// (proxy, timeout, redirect policy), registers request headers/cookies taken
// either from a Burp raw-request file or from the cookie/header flags, sets
// the User-Agent and URL allow/deny filters, and clones the collector into a
// second one used for link-finder requests.
//
// The process exits with status 1 when the root domain cannot be determined,
// when the limit rule cannot be applied, or when a user-supplied regular
// expression does not compile.
func NewCrawler(site *url.URL, cmd *cobra.Command) *Crawler {
	// Resolve the root domain; abort when it cannot be determined.
	domain := GetDomain(site)
	if domain == "" {
		Logger.Error("Failed to parse domain")
		os.Exit(1)
	}

	Logger.Infof("Start crawling: %s", site)

	// Errors returned by the flag getters are discarded throughout this
	// function, matching its existing behavior.
	quiet, _ := cmd.Flags().GetBool("quiet")
	jsonOutput, _ := cmd.Flags().GetBool("json")
	maxDepth, _ := cmd.Flags().GetInt("depth")
	concurrent, _ := cmd.Flags().GetInt("concurrent")
	delay, _ := cmd.Flags().GetInt("delay")
	randomDelay, _ := cmd.Flags().GetInt("random-delay")
	length, _ := cmd.Flags().GetBool("length")
	raw, _ := cmd.Flags().GetBool("raw")
	subs, _ := cmd.Flags().GetBool("subs")

	// compileUserRegex compiles a pattern supplied on the command line and
	// terminates with a readable message (instead of a panic) when it is invalid.
	compileUserRegex := func(flagName, pattern string) *regexp.Regexp {
		re, err := regexp.Compile(pattern)
		if err != nil {
			Logger.Errorf("Invalid %s regex %q: %s", flagName, pattern, err)
			os.Exit(1)
		}
		return re
	}

	// Main collector: asynchronous, depth-limited, ignoring robots.txt.
	c := colly.NewCollector(
		colly.Async(true),
		colly.MaxDepth(maxDepth),
		colly.IgnoreRobotsTxt(),
	)

	// Setup http client
	client := &http.Client{}

	// Set proxy. Note that this mutates the shared DefaultHTTPTransport, and
	// that an unparsable proxy URL is only logged: crawling then continues
	// without a proxy.
	proxy, _ := cmd.Flags().GetString("proxy")
	if proxy != "" {
		Logger.Infof("Proxy: %s", proxy)
		pU, err := url.Parse(proxy)
		if err != nil {
			Logger.Error("Failed to set proxy")
		} else {
			DefaultHTTPTransport.Proxy = http.ProxyURL(pU)
		}
	}

	// Set request timeout; 0 is replaced by a 10 second default.
	timeout, _ := cmd.Flags().GetInt("timeout")
	if timeout == 0 {
		Logger.Info("Your input timeout is 0. Gospider will set it to 10 seconds")
		client.Timeout = 10 * time.Second
	} else {
		client.Timeout = time.Duration(timeout) * time.Second
	}

	// Disable redirect
	noRedirect, _ := cmd.Flags().GetBool("no-redirect")
	if noRedirect {
		client.CheckRedirect = func(req *http.Request, via []*http.Request) error {
			nextLocation := req.Response.Header.Get("Location")
			Logger.Debugf("Found Redirect: %s", nextLocation)
			// Allow in redirect from http to https or in same hostname
			// We just check contain hostname or not because we set URLFilter in main collector so if
			// the URL is https://otherdomain.com/?url=maindomain.com, it will reject it
			if strings.Contains(nextLocation, site.Hostname()) {
				Logger.Infof("Redirecting to: %s", nextLocation)
				return nil
			}
			return http.ErrUseLastResponse
		}
	}

	// Set client transport
	client.Transport = DefaultHTTPTransport
	c.SetClient(client)

	// Get headers here to overwrite if "burp" flag used: the cookie and
	// headers of the raw request stored in the Burp file are replayed on
	// every request made by the collector.
	burpFile, _ := cmd.Flags().GetString("burp")
	if burpFile != "" {
		bF, err := os.Open(burpFile)
		if err != nil {
			Logger.Errorf("Failed to open Burp File: %s", err)
		} else {
			req, err := http.ReadRequest(bufio.NewReader(bF))
			// Only the parsed header section is used below, so the file can
			// be released immediately rather than leaked.
			bF.Close()
			if err != nil {
				Logger.Errorf("Failed to Parse Raw Request in %s: %s", burpFile, err)
			} else {
				// Set cookie
				c.OnRequest(func(r *colly.Request) {
					r.Headers.Set("Cookie", GetRawCookie(req.Cookies()))
				})
				// Set headers (first value of each header only)
				c.OnRequest(func(r *colly.Request) {
					for k, v := range req.Header {
						if len(v) == 0 {
							continue
						}
						r.Headers.Set(strings.TrimSpace(k), strings.TrimSpace(v[0]))
					}
				})
			}
		}
	}

	// Set cookies (ignored when a Burp file was given)
	cookie, _ := cmd.Flags().GetString("cookie")
	if cookie != "" && burpFile == "" {
		c.OnRequest(func(r *colly.Request) {
			r.Headers.Set("Cookie", cookie)
		})
	}

	// Set headers (ignored when a Burp file was given)
	headers, _ := cmd.Flags().GetStringArray("header")
	if burpFile == "" {
		for _, h := range headers {
			headerArgs := strings.SplitN(h, ":", 2)
			if len(headerArgs) != 2 {
				// Without a ':' there is no value part; indexing it would panic.
				Logger.Errorf("Ignoring malformed header %q: expected \"Name: value\"", h)
				continue
			}
			headerKey := strings.TrimSpace(headerArgs[0])
			headerValue := strings.TrimSpace(headerArgs[1])
			c.OnRequest(func(r *colly.Request) {
				r.Headers.Set(headerKey, headerValue)
			})
		}
	}

	// Set User-Agent: "mobi" and "web" (case-insensitive) select the random
	// User-Agent extensions; any other value is used verbatim.
	randomUA, _ := cmd.Flags().GetString("user-agent")
	switch strings.ToLower(randomUA) {
	case "mobi":
		extensions.RandomMobileUserAgent(c)
	case "web":
		extensions.RandomUserAgent(c)
	default:
		// Keep the caller's original casing for a custom User-Agent.
		c.UserAgent = randomUA
	}

	// Set referer
	extensions.Referer(c)

	// Init Output; the file name is the hostname with dots replaced by underscores.
	var output *Output
	outputFolder, _ := cmd.Flags().GetString("output")
	if outputFolder != "" {
		filename := strings.ReplaceAll(site.Hostname(), ".", "_")
		output = NewOutput(outputFolder, filename)
	}

	// Init Length Filter from a comma-separated list of integers.
	filterLengths := []int{}
	filterLength, _ := cmd.Flags().GetString("filter-length")
	if filterLength != "" {
		for _, field := range strings.Split(filterLength, ",") {
			field = strings.TrimSpace(field)
			if field == "" {
				continue
			}
			n, err := strconv.Atoi(field)
			if err != nil {
				Logger.Errorf("Ignoring invalid filter-length value %q: %s", field, err)
				continue
			}
			filterLengths = append(filterLengths, n)
		}
	}

	// Set url whitelist regex. The hostname is escaped so that its dots match
	// literally; with "subs" the scheme prefix is dropped so the hostname may
	// appear anywhere in the URL.
	hostPattern := regexp.QuoteMeta(site.Hostname())
	if !subs {
		hostPattern = "(?:https|http)://" + hostPattern
	}
	// Safe: the only variable part of the pattern has been quoted.
	c.URLFilters = append(c.URLFilters, regexp.MustCompile(hostPattern))

	// Set Limit Rule
	err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: concurrent,
		Delay:       time.Duration(delay) * time.Second,
		RandomDelay: time.Duration(randomDelay) * time.Second,
	})
	if err != nil {
		Logger.Errorf("Failed to set Limit Rule: %s", err)
		os.Exit(1)
	}

	// GoSpider default disallowed regex: static assets (images, audio/video,
	// fonts, css) are never requested.
	disallowedRegex := `(?i)\.(png|apng|bmp|gif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|css)(?:\?|#|$)`
	c.DisallowedURLFilters = append(c.DisallowedURLFilters, regexp.MustCompile(disallowedRegex))

	// Set optional blacklist url regex
	blacklists, _ := cmd.Flags().GetString("blacklist")
	if blacklists != "" {
		c.DisallowedURLFilters = append(c.DisallowedURLFilters, compileUserRegex("blacklist", blacklists))
	}

	// Set optional whitelist url regex. Each one REPLACES the filters set so
	// far on the main collector, so when both flags are given only the
	// whitelist-domain filter remains on it.
	var whiteListRe, whiteListDomainRe *regexp.Regexp
	whiteLists, _ := cmd.Flags().GetString("whitelist")
	if whiteLists != "" {
		whiteListRe = compileUserRegex("whitelist", whiteLists)
		c.URLFilters = []*regexp.Regexp{whiteListRe}
	}
	whiteListDomain, _ := cmd.Flags().GetString("whitelist-domain")
	if whiteListDomain != "" {
		// NOTE(review): the domain is inserted into the pattern unescaped, as
		// before, so regex metacharacters in it remain active — confirm intended.
		whiteListDomainRe = compileUserRegex("whitelist-domain", "http(s)?://"+whiteListDomain)
		c.URLFilters = []*regexp.Regexp{whiteListDomainRe}
	}

	// linkFinderCollector
	linkFinderCollector := c.Clone()
	// Try to request as much as Javascript source and don't care about domain.
	// The result of link finder will be send to Link Finder Collector to check is it working or not.
	// Only the explicit whitelist filters (both, when both are set) apply here.
	linkFinderCollector.URLFilters = nil
	if whiteListRe != nil {
		linkFinderCollector.URLFilters = append(linkFinderCollector.URLFilters, whiteListRe)
	}
	if whiteListDomainRe != nil {
		linkFinderCollector.URLFilters = append(linkFinderCollector.URLFilters, whiteListDomainRe)
	}

	return &Crawler{
		cmd:                 cmd,
		C:                   c,
		LinkFinderCollector: linkFinderCollector,
		site:                site,
		Quiet:               quiet,
		Input:               site.String(),
		JsonOutput:          jsonOutput,
		length:              length,
		raw:                 raw,
		domain:              domain,
		Output:              output,
		urlSet:              stringset.NewStringFilter(),
		subSet:              stringset.NewStringFilter(),
		jsSet:               stringset.NewStringFilter(),
		formSet:             stringset.NewStringFilter(),
		awsSet:              stringset.NewStringFilter(),
		filterLength_slice:  filterLengths,
	}
}