GoSpider Source Code Study

Project Introduction

Project repository: https://github.com/jaeles-project/gospider

GoSpider is a static crawler. I previously read through the crawlergo dynamic crawler; now let's take a look at a static one.

A quick summary of what it collects:


URL sources:

  • sitemap
  • robots.txt
  • Wayback Machine, Common Crawl, Virus Total, Alien Vault (external URL collection, much like lc/gau)
  • extracted from responses

From responses:

  • subdomains
  • aws-s3

Project Structure

├── core
│   ├── crawler.go # the crawler itself
│   ├── grep.go # extract subdomains and S3 buckets
│   ├── linkfinder.go # find paths in responses
│   ├── logger.go # logging
│   ├── othersource.go # other sources (external URL collection)
│   ├── output.go # output handling
│   ├── robots.go # extraction from robots.txt
│   ├── sitemap.go # extraction from the sitemap
│   ├── utils.go # utilities
│   └── version.go
├── Dockerfile
├── main.go
├── README.md
└── stringset
    ├── filter.go # link deduplication
    └── set.go

Source Code Study

crawler.go contains the concrete crawling implementation; the place to start is NewCrawler.

As you will see below, it uses the colly crawling framework to handle the entire crawler initialization.
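For readers who have not used colly before, a bare-bones collector looks roughly like this (a minimal sketch, independent of GoSpider, assuming the colly v2 module path):

package main

import (
    "fmt"

    "github.com/gocolly/colly/v2"
)

func main() {
    // An async collector with a depth limit, the same options NewCrawler passes below.
    c := colly.NewCollector(
        colly.Async(true),
        colly.MaxDepth(2),
    )

    // Follow every href found in the HTML.
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        link := e.Request.AbsoluteURL(e.Attr("href"))
        fmt.Println("[href]", link)
        _ = e.Request.Visit(link)
    })

    _ = c.Visit("https://example.com/")
    c.Wait() // required in async mode
}

With that in mind, here is NewCrawler: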

func NewCrawler(site *url.URL, cmd *cobra.Command) *Crawler {
// Get the root domain
domain := GetDomain(site)
if domain == "" {
Logger.Error("Failed to parse domain")
os.Exit(1)
}
Logger.Infof("Start crawling: %s", site)

quiet, _ := cmd.Flags().GetBool("quiet")
jsonOutput, _ := cmd.Flags().GetBool("json")
maxDepth, _ := cmd.Flags().GetInt("depth")
concurrent, _ := cmd.Flags().GetInt("concurrent")
delay, _ := cmd.Flags().GetInt("delay")
randomDelay, _ := cmd.Flags().GetInt("random-delay")
length, _ := cmd.Flags().GetBool("length")
raw, _ := cmd.Flags().GetBool("raw")
subs, _ := cmd.Flags().GetBool("subs")
// Use the colly crawling framework
c := colly.NewCollector(
colly.Async(true), // async
colly.MaxDepth(maxDepth), // crawl depth
colly.IgnoreRobotsTxt(), // ignore robots.txt
)

// Setup http client
client := &http.Client{}

// Set proxy
proxy, _ := cmd.Flags().GetString("proxy")
if proxy != "" {
Logger.Infof("Proxy: %s", proxy)
pU, err := url.Parse(proxy)
if err != nil {
Logger.Error("Failed to set proxy")
} else {
// Set the proxy via http.ProxyURL -- something I didn't know about before, learned it here
DefaultHTTPTransport.Proxy = http.ProxyURL(pU)
}
}

// Set request timeout
timeout, _ := cmd.Flags().GetInt("timeout")
if timeout == 0 {
Logger.Info("Your input timeout is 0. Gospider will set it to 10 seconds")
client.Timeout = 10 * time.Second
} else {
client.Timeout = time.Duration(timeout) * time.Second
}

// Disable redirect
noRedirect, _ := cmd.Flags().GetBool("no-redirect")
// Handle redirects
if noRedirect {
client.CheckRedirect = func(req *http.Request, via []*http.Request) error {
nextLocation := req.Response.Header.Get("Location")
Logger.Debugf("Found Redirect: %s", nextLocation)
// Allow in redirect from http to https or in same hostname
// We just check contain hostname or not because we set URLFilter in main collector so if
// the URL is https://otherdomain.com/?url=maindomain.com, it will reject it
if strings.Contains(nextLocation, site.Hostname()) {
Logger.Infof("Redirecting to: %s", nextLocation)
return nil
}
return http.ErrUseLastResponse
}
}

// Set client transport
client.Transport = DefaultHTTPTransport
c.SetClient(client)

// Get headers here to overwrite if "burp" flag used
// Take the Cookie and headers from a raw Burp request file and set them on colly requests
burpFile, _ := cmd.Flags().GetString("burp")
if burpFile != "" {
bF, err := os.Open(burpFile)
if err != nil {
Logger.Errorf("Failed to open Burp File: %s", err)
} else {
rd := bufio.NewReader(bF)
req, err := http.ReadRequest(rd)
if err != nil {
Logger.Errorf("Failed to Parse Raw Request in %s: %s", burpFile, err)
} else {
// Set cookie
c.OnRequest(func(r *colly.Request) {
r.Headers.Set("Cookie", GetRawCookie(req.Cookies()))
})
// Set headers
c.OnRequest(func(r *colly.Request) {
for k, v := range req.Header {
r.Headers.Set(strings.TrimSpace(k), strings.TrimSpace(v[0]))
}
})

}
}
}

// Set cookies
cookie, _ := cmd.Flags().GetString("cookie")
if cookie != "" && burpFile == "" {
c.OnRequest(func(r *colly.Request) {
r.Headers.Set("Cookie", cookie)
})
}

// Set headers
headers, _ := cmd.Flags().GetStringArray("header")
if burpFile == "" {
for _, h := range headers {
headerArgs := strings.SplitN(h, ":", 2)
headerKey := strings.TrimSpace(headerArgs[0])
headerValue := strings.TrimSpace(headerArgs[1])
c.OnRequest(func(r *colly.Request) {
r.Headers.Set(headerKey, headerValue)
})
}
}

// Set User-Agent using the UA helpers colly provides
randomUA, _ := cmd.Flags().GetString("user-agent")
switch ua := strings.ToLower(randomUA); {
case ua == "mobi":
extensions.RandomMobileUserAgent(c)
case ua == "web":
extensions.RandomUserAgent(c)
default:
c.UserAgent = ua
}

// Set referer: add a Referer header
extensions.Referer(c)

// Init Output
var output *Output
outputFolder, _ := cmd.Flags().GetString("output")
if outputFolder != "" {
filename := strings.ReplaceAll(site.Hostname(), ".", "_")
output = NewOutput(outputFolder, filename)
}

// Init Length Filter
filterLength_slice := []int{}
filterLength, _ := cmd.Flags().GetString("filter-length")
// Response-length filter
if filterLength != "" {
lengthArgs := strings.Split(filterLength, ",")
for i := 0; i < len(lengthArgs); i++ {
if i, err := strconv.Atoi(lengthArgs[i]); err == nil {
filterLength_slice = append(filterLength_slice, i)
}
}
}

// Set url whitelist regex
reg := ""
if subs {
reg = site.Hostname()
} else {
reg = "(?:https|http)://" + site.Hostname()
}

sRegex := regexp.MustCompile(reg)
c.URLFilters = append(c.URLFilters, sRegex)

// Set Limit Rule
err := c.Limit(&colly.LimitRule{
DomainGlob: "*",
Parallelism: concurrent,
Delay: time.Duration(delay) * time.Second,
RandomDelay: time.Duration(randomDelay) * time.Second,
})
if err != nil {
Logger.Errorf("Failed to set Limit Rule: %s", err)
os.Exit(1)
}

// GoSpider default disallowed regex
// Things not to crawl
disallowedRegex := `(?i)\.(png|apng|bmp|gif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|css)(?:\?|#|$)`
c.DisallowedURLFilters = append(c.DisallowedURLFilters, regexp.MustCompile(disallowedRegex))

// Set optional blacklist url regex
blacklists, _ := cmd.Flags().GetString("blacklist")
if blacklists != "" {
c.DisallowedURLFilters = append(c.DisallowedURLFilters, regexp.MustCompile(blacklists))
}

// Set optional whitelist url regex
// Whitelist
whiteLists, _ := cmd.Flags().GetString("whitelist")
if whiteLists != "" {
c.URLFilters = make([]*regexp.Regexp, 0)
c.URLFilters = append(c.URLFilters, regexp.MustCompile(whiteLists))
}

whiteListDomain, _ := cmd.Flags().GetString("whitelist-domain")
if whiteListDomain != "" {
c.URLFilters = make([]*regexp.Regexp, 0)
c.URLFilters = append(c.URLFilters, regexp.MustCompile("http(s)?://"+whiteListDomain))
}
// linkFinderCollector
linkFinderCollector := c.Clone()
// Try to request as much as Javascript source and don't care about domain.
// The result of link finder will be send to Link Finder Collector to check is it working or not.
linkFinderCollector.URLFilters = nil
if whiteLists != "" {
linkFinderCollector.URLFilters = append(linkFinderCollector.URLFilters, regexp.MustCompile(whiteLists))
}
if whiteListDomain != "" {
linkFinderCollector.URLFilters = append(linkFinderCollector.URLFilters, regexp.MustCompile("http(s)?://"+whiteListDomain))
}

return &Crawler{
cmd: cmd,
C: c,
LinkFinderCollector: linkFinderCollector,
site: site,
Quiet: quiet,
Input: site.String(),
JsonOutput: jsonOutput,
length: length,
raw: raw,
domain: domain,
Output: output,
urlSet: stringset.NewStringFilter(),
subSet: stringset.NewStringFilter(),
jsSet: stringset.NewStringFilter(),
formSet: stringset.NewStringFilter(),
awsSet: stringset.NewStringFilter(),
filterLength_slice: filterLength_slice,
}
}
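One small thing worth pulling out of the code above: the proxy is wired in through http.ProxyURL, which the author flags as new to them. In isolation it looks roughly like this (a sketch; the proxy address is just a placeholder):

package main

import (
    "log"
    "net/http"
    "net/url"
    "time"
)

func main() {
    proxyURL, err := url.Parse("http://127.0.0.1:8080") // placeholder proxy address
    if err != nil {
        log.Fatal(err)
    }
    client := &http.Client{
        Transport: &http.Transport{Proxy: http.ProxyURL(proxyURL)},
        Timeout:   10 * time.Second,
    }
    resp, err := client.Get("https://example.com/") // this request goes through the proxy
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()
    log.Println(resp.Status)
}

GoSpider does essentially the same, except it sets the Proxy field on its shared DefaultHTTPTransport and then hands the client to colly via c.SetClient(client).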

Next, let's look at how it handles links:

The extraction is done mainly through colly's OnHTML and OnResponse callbacks:

  1. href => visited via Request.Visit
  2. form[action] => added to formSet
  3. input[type="file"] => added to uploadFormSet
  4. [src] => js, json, xml links => fed to feedLinkfinder for link extraction
  5. responses => extract subdomain and aws_s3
// Start: initialize link handling

func (crawler *Crawler) Start(linkfinder bool) {
// Setup Link Finder
if linkfinder {
// Link extraction module
crawler.setupLinkFinder()
}

// Handle url: find links in href attributes
crawler.C.OnHTML("[href]", func(e *colly.HTMLElement) {
urlString := e.Request.AbsoluteURL(e.Attr("href"))
// Turn the link into a full URL
urlString = FixUrl(crawler.site, urlString)
if urlString == "" {
return
}
// Only URLs we haven't crawled yet
if !crawler.urlSet.Duplicate(urlString) {
outputFormat := fmt.Sprintf("[href] - %s", urlString)
if crawler.JsonOutput {
sout := SpiderOutput{
Input: crawler.Input,
Source: "body",
OutputType: "form",
Output: urlString,
}
if data, err := jsoniter.MarshalToString(sout); err == nil {
outputFormat = data
fmt.Println(outputFormat)
}
} else if !crawler.Quiet {
fmt.Println(outputFormat)
}
if crawler.Output != nil {
crawler.Output.WriteToFile(outputFormat)
}
// Add it to the crawl queue
_ = e.Request.Visit(urlString)
}
})

// Handle form
// Put the URLs of pages containing forms into formSet
crawler.C.OnHTML("form[action]", func(e *colly.HTMLElement) {
formUrl := e.Request.URL.String()
if !crawler.formSet.Duplicate(formUrl) {
outputFormat := fmt.Sprintf("[form] - %s", formUrl)
if crawler.JsonOutput {
sout := SpiderOutput{
Input: crawler.Input,
Source: "body",
OutputType: "form",
Output: formUrl,
}
if data, err := jsoniter.MarshalToString(sout); err == nil {
outputFormat = data
fmt.Println(outputFormat)
}
} else if !crawler.Quiet {
fmt.Println(outputFormat)
}
if crawler.Output != nil {
crawler.Output.WriteToFile(outputFormat)
}

}
})

// Find Upload Form
// Collect URLs of pages with file-upload forms
uploadFormSet := stringset.NewStringFilter()
crawler.C.OnHTML(`input[type="file"]`, func(e *colly.HTMLElement) {
uploadUrl := e.Request.URL.String()
if !uploadFormSet.Duplicate(uploadUrl) {
outputFormat := fmt.Sprintf("[upload-form] - %s", uploadUrl)
if crawler.JsonOutput {
sout := SpiderOutput{
Input: crawler.Input,
Source: "body",
OutputType: "upload-form",
Output: uploadUrl,
}
if data, err := jsoniter.MarshalToString(sout); err == nil {
outputFormat = data
fmt.Println(outputFormat)
}
} else if !crawler.Quiet {
fmt.Println(outputFormat)
}
if crawler.Output != nil {
crawler.Output.WriteToFile(outputFormat)
}
}

})

// Handle js files: collect js/xml/json links and pass them to feedLinkfinder
crawler.C.OnHTML("[src]", func(e *colly.HTMLElement) {
jsFileUrl := e.Request.AbsoluteURL(e.Attr("src"))
jsFileUrl = FixUrl(crawler.site, jsFileUrl)
if jsFileUrl == "" {
return
}

fileExt := GetExtType(jsFileUrl)
if fileExt == ".js" || fileExt == ".xml" || fileExt == ".json" {
crawler.feedLinkfinder(jsFileUrl, "javascript", "body")
}
})
// For responses => look for links or AWS S3 buckets in them
crawler.C.OnResponse(func(response *colly.Response) {
respStr := DecodeChars(string(response.Body))

if len(crawler.filterLength_slice) == 0 || !contains(crawler.filterLength_slice, len(respStr)) {

// Verify which link is working
u := response.Request.URL.String()
outputFormat := fmt.Sprintf("[url] - [code-%d] - %s", response.StatusCode, u)

if crawler.length {
outputFormat = fmt.Sprintf("[url] - [code-%d] - [len_%d] - %s", response.StatusCode, len(respStr), u)
}

if crawler.JsonOutput {
sout := SpiderOutput{
Input: crawler.Input,
Source: "body",
OutputType: "url",
StatusCode: response.StatusCode,
Output: u,
Length: strings.Count(respStr, "\n"),
}
if data, err := jsoniter.MarshalToString(sout); err == nil {
outputFormat = data
}
} else if crawler.Quiet {
outputFormat = u
}
fmt.Println(outputFormat)
if crawler.Output != nil {
crawler.Output.WriteToFile(outputFormat)
}
// If the response is in scope for the target, collect subdomain and AWS S3 info
if InScope(response.Request.URL, crawler.C.URLFilters) {
crawler.findSubdomains(respStr)
crawler.findAWSS3(respStr)
}

if crawler.raw {
outputFormat := fmt.Sprintf("[Raw] - \n%s\n", respStr) // PRINTCLEAN RAW for link visited only
if !crawler.Quiet {
fmt.Println(outputFormat)
}
if crawler.Output != nil {
crawler.Output.WriteToFile(outputFormat)
}
}
}
})
// Output for error responses
crawler.C.OnError(func(response *colly.Response, err error) {
Logger.Debugf("Error request: %s - Status code: %v - Error: %s", response.Request.URL.String(), response.StatusCode, err)
/*
1xx Informational
2xx Success
3xx Redirection
4xx Client Error
5xx Server Error
*/
if response.StatusCode == 404 || response.StatusCode == 429 || response.StatusCode < 100 || response.StatusCode >= 500 {
return
}

u := response.Request.URL.String()
outputFormat := fmt.Sprintf("[url] - [code-%d] - %s", response.StatusCode, u)

if crawler.JsonOutput {
sout := SpiderOutput{
Input: crawler.Input,
Source: "body",
OutputType: "url",
StatusCode: response.StatusCode,
Output: u,
Length: strings.Count(DecodeChars(string(response.Body)), "\n"),
}
if data, err := jsoniter.MarshalToString(sout); err == nil {
outputFormat = data
fmt.Println(outputFormat)
}
} else if crawler.Quiet {
fmt.Println(u)
} else {
fmt.Println(outputFormat)
}

if crawler.Output != nil {
crawler.Output.WriteToFile(outputFormat)
}
})
// Visit the target
err := crawler.C.Visit(crawler.site.String())
if err != nil {
Logger.Errorf("Failed to start %s: %s", crawler.site.String(), err)
}
}

First, let's look at how setupLinkFinder implements link extraction:

It registers a response handler on colly; the main work is:

  1. subdomains and S3 buckets
  2. use LinkFinder to extract paths from the response and join them into full URLs
  3. if the result is a js or json file, hand it back to feedLinkfinder
  4. otherwise, visit the rebuilt link
func (crawler *Crawler) setupLinkFinder() {
// Handle responses
crawler.LinkFinderCollector.OnResponse(func(response *colly.Response) {
if response.StatusCode == 404 || response.StatusCode == 429 || response.StatusCode < 100 {
return
}
respStr := string(response.Body)
// Response-length filter
if len(crawler.filterLength_slice) == 0 || !contains(crawler.filterLength_slice, len(respStr)) {

// Verify which link is working
u := response.Request.URL.String()
//....
// Check whether the requested URL matches the target regex or the whitelist
if InScope(response.Request.URL, crawler.C.URLFilters) {
// Extract subdomains and S3 buckets
crawler.findSubdomains(respStr)
crawler.findAWSS3(respStr)
// Extract paths from the response
paths, err := LinkFinder(respStr)
if err != nil {
Logger.Error(err)
return
}

currentPathURL, err := url.Parse(u)
currentPathURLerr := false
if err != nil {
currentPathURLerr = true
}

for _, relPath := range paths {
var outputFormat string
// output ...
rebuildURL := ""
// Join the path with the current URL or the site URL to get a full URL => ResolveReference (learned this URL resolution trick here)
if !currentPathURLerr {
rebuildURL = FixUrl(currentPathURL, relPath)
} else {
rebuildURL = FixUrl(crawler.site, relPath)
}

if rebuildURL == "" {
continue
}

// Try to request JS path
// Try to generate URLs with main site
fileExt := GetExtType(rebuildURL)
if fileExt == ".js" || fileExt == ".xml" || fileExt == ".json" || fileExt == ".map" {
crawler.feedLinkfinder(rebuildURL, "linkfinder", "javascript")
} else if !crawler.urlSet.Duplicate(rebuildURL) {
if crawler.JsonOutput {
sout := SpiderOutput{
Input: crawler.Input,
Source: response.Request.URL.String(),
OutputType: "linkfinder",
Output: rebuildURL,
}
if data, err := jsoniter.MarshalToString(sout); err == nil {
outputFormat = data
}
} else if !crawler.Quiet {
outputFormat = fmt.Sprintf("[linkfinder] - %s", rebuildURL)
}

fmt.Println(outputFormat)

if crawler.Output != nil {
crawler.Output.WriteToFile(outputFormat)
}
// Queue a visit
_ = crawler.C.Visit(rebuildURL)
}

// Try to generate URLs with the site where Javascript file host in (must be in main or sub domain)
// Visit this rebuilt path
urlWithJSHostIn := FixUrl(crawler.site, relPath)
if urlWithJSHostIn != "" {
fileExt := GetExtType(urlWithJSHostIn)
if fileExt == ".js" || fileExt == ".xml" || fileExt == ".json" || fileExt == ".map" {
crawler.feedLinkfinder(urlWithJSHostIn, "linkfinder", "javascript")
} else {
// Has it been crawled before? The filter is used directly here
if crawler.urlSet.Duplicate(urlWithJSHostIn) {
continue
} else {
// Queue a visit to this link
_ = crawler.C.Visit(urlWithJSHostIn) //not print care for lost link
}
}

}

}

}
}
})
}

crawler.urlSet.Duplicate(urlWithJSHostIn) here simply checks whether the URL already exists in a map to decide if it has been crawled; beyond that, not much real request deduplication is done…
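For reference, that filter is essentially just a set lookup. A minimal sketch of what stringset's Duplicate boils down to conceptually (my paraphrase with a mutex-guarded map; the real filter.go may differ in detail):

package stringset

import "sync"

type StringFilter struct {
    mu   sync.Mutex
    seen map[string]struct{}
}

func NewStringFilter() *StringFilter {
    return &StringFilter{seen: make(map[string]struct{})}
}

// Duplicate reports whether s has been seen before and records it if not.
func (f *StringFilter) Duplicate(s string) bool {
    f.mu.Lock()
    defer f.mu.Unlock()
    if _, ok := f.seen[s]; ok {
        return true
    }
    f.seen[s] = struct{}{}
    return false
}

So two identical strings are deduplicated, but there is no normalization of query parameters, fragments, and so on.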

Let's first look at LinkFinder, which extracts paths from a response:

var linkFinderRegex = regexp.MustCompile(`(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')`)

// LinkFinder finds path information in a response
func LinkFinder(source string) ([]string, error) {
var links []string
// source = strings.ToLower(source)
// If the response is very large, preprocess it first to make the regex more efficient
if len(source) > 1000000 {
source = strings.ReplaceAll(source, ";", ";\r\n")
source = strings.ReplaceAll(source, ",", ",\r\n")
}
source = DecodeChars(source)

match := linkFinderRegex.FindAllStringSubmatch(source, -1)
for _, m := range match {
// FilterNewLines uses regexp.MustCompile(`[\t\r\n]+`).ReplaceAllString to strip newlines, tabs, and so on
matchGroup1 := FilterNewLines(m[1])
if matchGroup1 == "" {
continue
}
links = append(links, matchGroup1)
}
// Deduplicate the links (a map[url]bool check for entries already seen)
links = Unique(links)
return links, nil
}

As for inserting line breaks before running the regex: the author asked GPT about it, and the gist (also reflected in the code comment above) is that splitting a huge, typically minified, response at `;` and `,` keeps the individual segments short, which makes the regex pass over a very large body more manageable.

Next, FixUrl, which joins a URL with a crawled path:

It uses ResolveReference for this. ResolveReference automatically handles absolute and relative paths, and if the reference is already a full URL it is returned as-is without any joining, which is quite convenient.

func FixUrl(mainSite *url.URL, nextLoc string) string {
nextLocUrl, err := url.Parse(nextLoc)
if err != nil {
return ""
}
return mainSite.ResolveReference(nextLocUrl).String()
}
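To make ResolveReference's behavior concrete, a quick illustrative example (the URLs are made up):

package main

import (
    "fmt"
    "net/url"
)

func main() {
    base, _ := url.Parse("https://example.com/app/index.html")
    for _, ref := range []string{
        "/api/v1/users",       // absolute path: replaces the base path
        "static/main.js",      // relative path: resolved against /app/
        "../img/logo.png",     // parent-relative path
        "https://other.com/x", // already a full URL: returned as-is
    } {
        r, _ := url.Parse(ref)
        fmt.Println(base.ResolveReference(r))
    }
    // Output:
    // https://example.com/api/v1/users
    // https://example.com/app/static/main.js
    // https://example.com/img/logo.png
    // https://other.com/x
}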

Then there is feedLinkfinder, which handles JS-like files:

func (crawler *Crawler) feedLinkfinder(jsFileUrl string, OutputType string, source string) {
// Check whether this JS file has been crawled before
if !crawler.jsSet.Duplicate(jsFileUrl) {
// ....

// Send Javascript to Link Finder Collector
// Just visit it -- the actual extraction happens in setupLinkFinder
_ = crawler.LinkFinderCollector.Visit(jsFileUrl)

}
}

It also checks every request's response for subdomains and S3 buckets:

These are extracted with regexes as well. You can see that the two functions share the same structure, so the regexes could be pulled out into a configuration file and iterated over, which would make it easier to add new rules later (a rough sketch of that idea follows after the code):

// SUBRE is the subdomain regular expression
const SUBRE = `(?i)(([a-zA-Z0-9]{1}|[_a-zA-Z0-9]{1}[_a-zA-Z0-9-]{0,61}[a-zA-Z0-9]{1})[.]{1})+`

var AWSS3 = regexp.MustCompile(`(?i)[a-z0-9.-]+\.s3\.amazonaws\.com|[a-z0-9.-]+\.s3-[a-z0-9-]\.amazonaws\.com|[a-z0-9.-]+\.s3-website[.-](eu|ap|us|ca|sa|cn)|//s3\.amazonaws\.com/[a-z0-9._-]+|//s3-[a-z0-9-]+\.amazonaws\.com/[a-z0-9._-]+`)

// SubdomainRegex returns a Regexp object initialized to match
// subdomain names that end with the domain provided by the parameter.
// Build a subdomain regex for the given root domain
func subdomainRegex(domain string) *regexp.Regexp {
// Change all the periods into literal periods for the regex
// Turn "." into a literal [.] for the regex
d := strings.Replace(domain, ".", "[.]", -1)
return regexp.MustCompile(SUBRE + d)
}

// GetSubdomains extracts subdomains from the source (response body)
func GetSubdomains(source, domain string) []string {
var subs []string
re := subdomainRegex(domain)
for _, match := range re.FindAllStringSubmatch(source, -1) {
subs = append(subs, CleanSubdomain(match[0]))
}
return subs
}

// GetAWSS3 extracts AWS S3 bucket URLs
func GetAWSS3(source string) []string {
var aws []string
for _, match := range AWSS3.FindAllStringSubmatch(source, -1) {
aws = append(aws, DecodeChars(match[0]))
}
return aws
}
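A rough sketch of that config-driven idea (hypothetical names such as grepRule and grepAll; the rules are hard-coded here, but they could just as well be loaded from a JSON or YAML file):

// grepRule pairs a rule name with its pattern.
type grepRule struct {
    Name    string
    Pattern *regexp.Regexp
}

// The rule table; adding a new rule is just one more entry (or one more config line).
// The domain-dependent subdomain regex could be appended at startup via subdomainRegex(domain).
var grepRules = []grepRule{
    {Name: "aws-s3", Pattern: AWSS3},
    // {Name: "new-secret", Pattern: regexp.MustCompile(`...`)},
}

// grepAll runs every rule against a response body and groups matches by rule name.
func grepAll(source string) map[string][]string {
    results := make(map[string][]string)
    for _, rule := range grepRules {
        for _, m := range rule.Pattern.FindAllString(source, -1) {
            results[rule.Name] = append(results[rule.Name], m)
        }
    }
    return results
}

findSubdomains and findAWSS3 could then both collapse into a single loop over the rule table.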

Extraction from robots.txt:

// ParseRobots looks for URLs in robots.txt
func ParseRobots(site *url.URL, crawler *Crawler, c *colly.Collector, wg *sync.WaitGroup) {
defer wg.Done()
robotsURL := site.String() + "/robots.txt"

resp, err := http.Get(robotsURL)
if err != nil {
return
}
if resp.StatusCode == 200 {
Logger.Infof("Found robots.txt: %s", robotsURL)
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
return
}
lines := strings.Split(string(body), "\n")
// This regex matches the "Disallow: " / "Allow: " prefix
var re = regexp.MustCompile(".*llow: ")
for _, line := range lines {
if strings.Contains(line, "llow: ") {
// Strip the "Disallow:"/"Allow:" prefix (via the regex)
url := re.ReplaceAllString(line, "")
url = FixUrl(site, url)
if url == "" {
continue
}
outputFormat := fmt.Sprintf("[robots] - %s", url)

if crawler.JsonOutput {
sout := SpiderOutput{
Input: crawler.Input,
Source: "robots",
OutputType: "url",
Output: url,
}
if data, err := jsoniter.MarshalToString(sout); err == nil {
outputFormat = data
}
} else if crawler.Quiet {
outputFormat = url
}
fmt.Println(outputFormat)
if crawler.Output != nil {
crawler.Output.WriteToFile(outputFormat)
}
_ = c.Visit(url)
}
}
}
}
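To make the `.*llow: ` trick concrete, here is what the replacement does on a couple of sample robots.txt lines (sample values only; the greedy `.*` swallows the "Dis" prefix, so both directives are handled):

package main

import (
    "fmt"
    "regexp"
)

func main() {
    re := regexp.MustCompile(".*llow: ")
    fmt.Println(re.ReplaceAllString("Disallow: /admin/", ""))   // /admin/
    fmt.Println(re.ReplaceAllString("Allow: /public/css/", "")) // /public/css/
}

Each extracted path is then resolved against the site with FixUrl and queued for a visit.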

For the sitemap, it uses the gopher-parse-sitemap library to do the parsing:

func ParseSiteMap(site *url.URL, crawler *Crawler, c *colly.Collector, wg *sync.WaitGroup) {
defer wg.Done()
sitemapUrls := []string{"/sitemap.xml", "/sitemap_news.xml", "/sitemap_index.xml", "/sitemap-index.xml", "/sitemapindex.xml",
"/sitemap-news.xml", "/post-sitemap.xml", "/page-sitemap.xml", "/portfolio-sitemap.xml", "/home_slider-sitemap.xml", "/category-sitemap.xml",
"/author-sitemap.xml"}

for _, path := range sitemapUrls {
// Ignore error when that not valid sitemap.xml path
Logger.Infof("Trying to find %s", site.String()+path)
_ = sitemap.ParseFromSite(site.String()+path, func(entry sitemap.Entry) error {
outputFormat := fmt.Sprintf("[sitemap] - %s", entry.GetLocation())

if crawler.JsonOutput {
sout := SpiderOutput{
Input: crawler.Input,
Source: "sitemap",
OutputType: "url",
Output: entry.GetLocation(),
}
if data, err := jsoniter.MarshalToString(sout); err == nil {
outputFormat = data
}
} else if crawler.Quiet {
outputFormat = entry.GetLocation()
}
fmt.Println(outputFormat)
if crawler.Output != nil {
crawler.Output.WriteToFile(outputFormat)
}
_ = c.Visit(entry.GetLocation())
return nil
})
}

}

There is also third-party URL collection from sources like the Wayback Machine, Common Crawl, Virus Total, and Alien Vault:


func getWaybackURLs(domain string, noSubs bool) ([]wurl, error) {
subsWildcard := "*."
if noSubs {
subsWildcard = ""
}
// Fetch results from the web archive
res, err := http.Get(
fmt.Sprintf("http://web.archive.org/cdx/search/cdx?url=%s%s/*&output=json&collapse=urlkey", subsWildcard, domain),
)
if err != nil {
return []wurl{}, err
}

raw, err := ioutil.ReadAll(res.Body)

res.Body.Close()
if err != nil {
return []wurl{}, err
}

var wrapper [][]string
err = json.Unmarshal(raw, &wrapper)

out := make([]wurl, 0, len(wrapper))

skip := true
for _, urls := range wrapper {
// The first item is always just the string "original",
// so we should skip the first item
if skip {
skip = false
continue
}
out = append(out, wurl{date: urls[1], url: urls[2]})
}

return out, nil

}

Takeaways

  1. Learned how to use the colly crawling framework
  2. http.ProxyURL(pU) and ResolveReference

There are still a few places that could be optimized. For example, GoSpider does not deduplicate its request queue, and the extraction of sensitive information such as aws-s3 from responses could be driven by a configuration file, which would make it easier to add new rules later; a dynamic crawler could do the same.
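If stronger deduplication were wanted, one possible direction (purely my assumption, not how GoSpider or uro actually works) is to dedup by URL pattern rather than by exact string, e.g. host + path + sorted query keys:

package main

import (
    "fmt"
    "net/url"
    "sort"
    "strings"
)

// dedupKey reduces a URL to host + path + sorted query keys, ignoring values and fragments,
// so https://a.com/p?id=1 and https://a.com/p?id=2 collapse into the same key.
func dedupKey(raw string) (string, error) {
    u, err := url.Parse(raw)
    if err != nil {
        return "", err
    }
    keys := make([]string, 0, len(u.Query()))
    for k := range u.Query() {
        keys = append(keys, k)
    }
    sort.Strings(keys)
    return strings.ToLower(u.Host) + u.Path + "?" + strings.Join(keys, "&"), nil
}

func main() {
    k1, _ := dedupKey("https://a.com/p?id=1")
    k2, _ := dedupKey("https://a.com/p?id=2")
    fmt.Println(k1 == k2) // true
}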

I've seen people online run GoSpider's output through uro for link deduplication afterwards; worth looking into later.

