subfinder 源码学习

项目介绍

项目地址：https://github.com/projectdiscovery/subfinder

subfinder 是一种子域发现工具，它使用被动在线资源返回网站的有效子域。它具有简单的模块化架构，并针对速度进行了优化。它专为只做一件事——被动子域枚举——而构建，并且把这件事做得很好。

项目结构

├── passive							
│   ├── passive.go					# 被动收集的主要实现
│   ├── sources.go					# 数据源初始化
├── resolve									
│   ├── client.go					# dns 相关的初始化
│   └── resolve.go					# dns 解析域名, 收集泛解析的 hosts 黑名单, 去除存在泛解析的子域结果
├── runner
│   ├── banners.go
│   ├── config.go					# api key 类数据源的配置文件 序列化 反序列化
│   ├── enumerate.go				# 调用 passive.go 进行单个域名被动收集的实现
│   ├── initialize.go				# 初始化 dns 和 数据源
│   ├── options.go					# 参数解析
│   ├── outputter.go				# 输出
│   ├── runner.go					# 调用 enumerate.go 实现批量子域收集和结果的处理
│   ├── stats.go					# 统计数据源及结果
│   ├── util.go
│   └── validate.go					# 参数验证、正则处理、gologger 日志模式
├── subscraping
│   ├── agent.go					# 封装 http 请求
│   ├── extractor.go				# 正则处理、正则匹配获取子域名
│   ├── sources						# 各种被动收集子域的数据源
│   ├── types.go					# 数据源的接口
│   └── utils.go					# api key 的处理

源码学习

passive.go 就是 subfinder 进行被动收集的实现了：

// EnumerateSubdomainsWithCtx runs every configured source against domain in
// parallel and streams their results over the returned channel.
//
// All work happens in a background goroutine, so the call returns immediately.
// The channel is closed only when that goroutine exits; until then a caller
// ranging over the channel keeps blocking, which is how completion is
// signalled. Failures while building the rate limiter or the session are
// reported as a single subscraping.Error result before the channel is closed.
// maxEnumTime bounds the whole enumeration through a context timeout.
func (a *Agent) EnumerateSubdomainsWithCtx(ctx context.Context, domain string, proxy string, rateLimit int, timeout int, maxEnumTime time.Duration, options ...EnumerateOption) chan subscraping.Result {
	results := make(chan subscraping.Result)

	go func() {
		defer close(results)

		var enumerateOptions EnumerationOptions
		for _, enumerateOption := range options {
			enumerateOption(&enumerateOptions)
		}

		// Build the per-source rate limiter.
		multiRateLimiter, err := a.buildMultiRateLimiter(ctx, rateLimit, enumerateOptions.customRateLimiter)
		if err != nil {
			results <- subscraping.Result{
				Type: subscraping.Error, Error: fmt.Errorf("could not init multi rate limiter for %s: %w", domain, err),
			}
			return
		}

		// Create the session the sources use to issue their HTTP requests.
		session, err := subscraping.NewSession(domain, proxy, multiRateLimiter, timeout)
		if err != nil {
			results <- subscraping.Result{
				Type: subscraping.Error, Error: fmt.Errorf("could not init passive session for %s: %w", domain, err),
			}
			return
		}
		defer session.Close()

		// Bound the total enumeration time. cancel is deferred so the timer is
		// released on every exit path, including a panic.
		ctx, cancel := context.WithTimeout(ctx, maxEnumTime)
		defer cancel()

		wg := &sync.WaitGroup{}
		// Run each source in parallel on the target domain.
		for _, runner := range a.sources {
			wg.Add(1)
			go func(source subscraping.Source) {
				// Deferred so the WaitGroup is released even if the source panics.
				defer wg.Done()

				// Carry the source name in the context under CtxSourceArg.
				ctxWithValue := context.WithValue(ctx, subscraping.CtxSourceArg, source.Name())
				// This loop ends only when the source closes its channel.
				for resp := range source.Run(ctxWithValue, domain, session) {
					results <- resp
				}
			}(runner)
		}
		wg.Wait()
	}()

	return results
}

可以看下数据源的 Run 方法：

// Run queries the AlienVault OTX passive DNS endpoint for domain and streams
// what it finds over the returned channel.
//
// The request runs in its own goroutine, so Run returns immediately. The
// channel is closed when that goroutine finishes, at which point the elapsed
// time is stored in s.timeTaken. Request and decode failures, as well as an
// error reported inside the JSON payload, are emitted as subscraping.Error
// results; every hostname in the payload is emitted as a subscraping.Subdomain
// result.
func (s *Source) Run(ctx context.Context, domain string, session *subscraping.Session) <-chan subscraping.Result {
	out := make(chan subscraping.Result)
	s.errors, s.results = 0, 0

	go func() {
		began := time.Now()
		defer func() {
			s.timeTaken = time.Since(began)
			close(out)
		}()

		// Fetch the passive DNS records for the domain.
		endpoint := fmt.Sprintf("https://otx.alienvault.com/api/v1/indicators/domain/%s/passive_dns", domain)
		resp, err := session.SimpleGet(ctx, endpoint)
		if resp == nil && err != nil {
			out <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
			s.errors++
			session.DiscardHTTPResponse(resp)
			return
		}

		// Decode the body; it is closed whether or not decoding succeeds.
		var payload alienvaultResponse
		if decodeErr := json.NewDecoder(resp.Body).Decode(&payload); decodeErr != nil {
			out <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: decodeErr}
			s.errors++
			resp.Body.Close()
			return
		}
		resp.Body.Close()

		// The payload itself may carry an error description.
		if payload.Error != "" {
			out <- subscraping.Result{
				Source: s.Name(), Type: subscraping.Error, Error: fmt.Errorf("%s, %s", payload.Detail, payload.Error),
			}
			return
		}

		// Forward every hostname found.
		for _, entry := range payload.PassiveDNS {
			out <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: entry.Hostname}
			s.results++
		}
	}()

	return out
}

这样使用协程让收集子域名的速度大大增加，对于一些有速率限制的数据源，subfinder 也做了相应的处理：

// buildMultiRateLimiter registers one rate limiter per configured source and
// returns the combined limiter.
//
// A source gets a per-second limit only when rateLimit.Custom has an entry for
// its lower-cased name; that entry is combined with globalRateLimit through
// sourceRateLimitOrDefault. Sources without an entry, or whose resulting limit
// is zero, are registered with math.MaxUint32 per millisecond, i.e. effectively
// unlimited. A nil rateLimit is treated as "no custom limits". The first error
// from addRateLimiter stops the loop and is returned.
func (a *Agent) buildMultiRateLimiter(ctx context.Context, globalRateLimit int, rateLimit *subscraping.CustomRateLimit) (*ratelimit.MultiLimiter, error) {
	var multiRateLimiter *ratelimit.MultiLimiter
	var err error
	for _, source := range a.sources {
		var rl uint
		// Guard against a nil rateLimit: callers that pass no custom rate limit
		// option hand over a nil pointer here.
		if rateLimit != nil {
			if sourceRateLimit, ok := rateLimit.Custom.Get(strings.ToLower(source.Name())); ok {
				rl = sourceRateLimitOrDefault(uint(globalRateLimit), sourceRateLimit)
			}
		}

		if rl > 0 {
			// Rate-limited source: rl requests per second.
			multiRateLimiter, err = addRateLimiter(ctx, multiRateLimiter, source.Name(), rl, time.Second)
		} else {
			// No limit configured: use the largest possible budget.
			multiRateLimiter, err = addRateLimiter(ctx, multiRateLimiter, source.Name(), math.MaxUint32, time.Millisecond)
		}
		if err != nil {
			return multiRateLimiter, err
		}
	}
	return multiRateLimiter, err
}

速率限制器会被添加到 session 中，也就是 http 请求那里 HTTPRequest ：

// Read the source name stored in ctx under CtxSourceArg.
// NOTE(review): this is an unchecked type assertion; it panics if the value is
// missing or is not a string — confirm every caller sets it.
sourceName := ctx.Value(CtxSourceArg).(string)
// Apply the rate limit registered for that source before sending the request.
mrlErr := s.MultiRateLimiter.Take(sourceName)
if mrlErr != nil {
    return nil, mrlErr
}

return httpRequestWrapper(s.Client, req)

这个就是 subfinder 被动域名收集的具体实现。

再看一下数据源的处理部分，它初始化数据源的方式是这样的：

// init indexes every entry of AllSources in NameSourceMap under its
// lower-cased name.
func init() {
	for _, src := range AllSources {
		key := strings.ToLower(src.Name())
		NameSourceMap[key] = src
	}
}

// New creates a new agent for passive subdomain discovery.
//
// The set of sources is chosen as follows:
//  1. If useAllSources is set, every entry of NameSourceMap is used.
//  2. Otherwise, if sourceNames is non-empty, only those names are used; a
//     name with no entry in NameSourceMap is reported with a warning and
//     skipped.
//  3. Otherwise, every source in AllSources whose IsDefault reports true is
//     used.
//
// After selection, the names in excludedSourceNames are removed and, when
// useSourcesSupportingRecurse is set, sources without recursive support are
// dropped. The final selection is logged at debug level, and any message
// registered in sourceWarnings for a selected source is logged as a warning.
func New(sourceNames, excludedSourceNames []string, useAllSources, useSourcesSupportingRecurse bool) *Agent {
	sources := make(map[string]subscraping.Source, len(AllSources))

	switch {
	case useAllSources:
		maps.Copy(sources, NameSourceMap)
	case len(sourceNames) > 0:
		for _, name := range sourceNames {
			source := NameSourceMap[name]
			if source == nil {
				gologger.Warning().Msgf("There is no source with the name: %s", name)
				continue
			}
			sources[name] = source
		}
	default:
		for _, currentSource := range AllSources {
			if currentSource.IsDefault() {
				sources[currentSource.Name()] = currentSource
			}
		}
	}

	// Ranging over a nil or empty slice is a no-op, so no length guard is needed.
	for _, sourceName := range excludedSourceNames {
		delete(sources, sourceName)
	}

	if useSourcesSupportingRecurse {
		for sourceName, source := range sources {
			if !source.HasRecursiveSupport() {
				delete(sources, sourceName)
			}
		}
	}

	// Msg, not Msgf: the text is already formatted, and passing it as a format
	// string would misinterpret any '%' contained in a source name.
	gologger.Debug().Msg(fmt.Sprintf("Selected source(s) for this search: %s", strings.Join(maps.Keys(sources), ", ")))

	for _, currentSource := range sources {
		if warning, ok := sourceWarnings.Get(strings.ToLower(currentSource.Name())); ok {
			gologger.Warning().Msg(warning)
		}
	}

	// Create the agent from the sources that remain after filtering.
	agent := &Agent{sources: maps.Values(sources)}

	return agent
}

然后是 dns 解析部分，使用的是 github.com/projectdiscovery/dnsx/libs/dnsx 完成的 dns 解析，泛解析的处理方式如下：

// InitWildcards fills the wildcard IP set for domain.
//
// It resolves maxWildcardChecks randomly named subdomains that should not
// exist. Every answer returned for such a name is recorded in r.wildcardIPs.
// If any probe returns no answers, the domain is reported as not being a
// wildcard domain and the function stops.
func (r *ResolutionPool) InitWildcards(domain string) error {
	for attempt := 0; attempt < maxWildcardChecks; attempt++ {
		// A fresh xid makes a label that is not expected to exist.
		probe := xid.New().String() + "." + domain

		// The lookup error is discarded; only an empty answer counts as failure.
		answers, _ := r.DNSClient.Lookup(probe)
		if len(answers) == 0 {
			return fmt.Errorf("%s is not a wildcard domain", domain)
		}

		// Remember every address the non-existent name resolved to.
		for _, answer := range answers {
			r.wildcardIPs[answer] = struct{}{}
		}
	}
	return nil
}

// resolveWorker consumes r.Tasks and publishes results on r.Results, using the
// wildcard IP set as a deny list.
//
// When wildcard removal is disabled each task is forwarded unchanged with an
// empty IP. Otherwise the host is resolved: lookup errors are published as
// Error results, hosts with no answers are dropped, and hosts with any answer
// present in r.wildcardIPs are dropped. Remaining hosts are published with
// their first answer as the IP. The wait group is released when r.Tasks is
// closed and drained.
func (r *ResolutionPool) resolveWorker() {
	defer r.wg.Done()

	for task := range r.Tasks {
		if !r.removeWildcard {
			r.Results <- Result{Type: Subdomain, Host: task.Host, IP: "", Source: task.Source}
			continue
		}

		answers, err := r.DNSClient.Lookup(task.Host)
		switch {
		case err != nil:
			r.Results <- Result{Type: Error, Host: task.Host, Source: task.Source, Error: err}
			continue
		case len(answers) == 0:
			continue
		}

		// A single answer found in the wildcard set disqualifies the host.
		wildcard := false
		for _, answer := range answers {
			if _, listed := r.wildcardIPs[answer]; listed {
				wildcard = true
				break
			}
		}
		if wildcard {
			continue
		}

		r.Results <- Result{Type: Subdomain, Host: task.Host, IP: answers[0], Source: task.Source}
	}
}

然后再看看 subfinder 是如何处理被动收集的结果：

// EnumerateSingleDomainWithCtx enumerates the subdomains of one domain and
// writes the results to every writer in writers.
//
// Steps:
//  1. If wildcard removal is requested, create a resolution pool and collect
//     the wildcard IPs of the domain.
//  2. Start the passive enumeration through EnumerateSubdomainsWithCtx.
//  3. Process the passive results in a separate goroutine:
//     3.1 skip values that are not subdomains of domain;
//     3.2 lower-case the value and strip any "*." from it;
//     3.3 apply the user-defined filter/match regexes (filterAndMatchSubdomain);
//     3.4 de-duplicate with a map and track which sources reported each host;
//     3.5 build a resolve.HostEntry and, when wildcard removal is requested,
//     queue it on the resolution pool.
//  4. When wildcard removal is requested, collect the resolved hosts.
//  5. Wait for the processing goroutine, write the output, invoke the result
//     callback, and optionally print per-source statistics.
//
// It returns the first write error, or nil.
func (r *Runner) EnumerateSingleDomainWithCtx(ctx context.Context, domain string, writers []io.Writer) error {
	gologger.Info().Msgf("Enumerating subdomains for %s\n", domain)

	// Check if the user has asked to remove wildcards explicitly.
	// If yes, create the resolution pool and get the wildcards for the current domain
	var resolutionPool *resolve.ResolutionPool
	if r.options.RemoveWildcard {
		resolutionPool = r.resolverClient.NewResolutionPool(r.options.Threads, r.options.RemoveWildcard)
		err := resolutionPool.InitWildcards(domain)
		if err != nil {
			// Log the error but don't quit.
			gologger.Warning().Msgf("Could not get wildcards for domain %s: %s\n", domain, err)
		}
	}

	// Run the passive subdomain enumeration
	now := time.Now()
	passiveResults := r.passiveAgent.EnumerateSubdomainsWithCtx(ctx, domain, r.options.Proxy, r.options.RateLimit, r.options.Timeout, time.Duration(r.options.MaxEnumerationTime)*time.Minute, passive.WithCustomRateLimit(r.rateLimit))

	wg := &sync.WaitGroup{}
	wg.Add(1)
	// Create a unique map for filtering duplicate subdomains out
	uniqueMap := make(map[string]resolve.HostEntry)
	// Create a map to track sources for each host
	sourceMap := make(map[string]map[string]struct{})
	skippedCounts := make(map[string]int)
	// Process the results in a separate goroutine
	go func() {
		// Deferred so the waiter is released even if processing panics.
		defer wg.Done()

		for result := range passiveResults {
			switch result.Type {
			case subscraping.Error:
				gologger.Warning().Msgf("Could not run source %s: %s\n", result.Source, result.Error)
			case subscraping.Subdomain:
				// Only keep values that are subdomains of the target domain.
				if !strings.HasSuffix(result.Value, "."+domain) {
					skippedCounts[result.Source]++
					continue
				}
				subdomain := strings.ReplaceAll(strings.ToLower(result.Value), "*.", "")
				// Apply the user-defined filter and match regexes.
				if matchSubdomain := r.filterAndMatchSubdomain(subdomain); matchSubdomain {
					if _, ok := uniqueMap[subdomain]; !ok {
						sourceMap[subdomain] = make(map[string]struct{})
					}

					// Log the verbose message about the found subdomain per source
					if _, ok := sourceMap[subdomain][result.Source]; !ok {
						gologger.Verbose().Label(result.Source).Msg(subdomain)
					}

					sourceMap[subdomain][result.Source] = struct{}{}

					// Check if the subdomain is a duplicate. If not,
					// send the subdomain for resolution.
					if _, ok := uniqueMap[subdomain]; ok {
						skippedCounts[result.Source]++
						continue
					}

					hostEntry := resolve.HostEntry{Domain: domain, Host: subdomain, Source: result.Source}

					uniqueMap[subdomain] = hostEntry
					// If the user asked to remove wildcards, send the entry to
					// the resolution queue.
					if r.options.RemoveWildcard {
						resolutionPool.Tasks <- hostEntry
					}
				}
			}
		}
		// Close the task channel only if wildcards are asked to be removed
		if r.options.RemoveWildcard {
			close(resolutionPool.Tasks)
		}
	}()

	// If the user asked to remove wildcards, listen from the results
	// queue and write to the map. At the end, print the found results to the screen
	foundResults := make(map[string]resolve.Result)
	if r.options.RemoveWildcard {
		// Process the results coming from the resolutions pool
		for result := range resolutionPool.Results {
			switch result.Type {
			case resolve.Error:
				gologger.Warning().Msgf("Could not resolve host: %s\n", result.Error)
			case resolve.Subdomain:
				// Add the found subdomain to a map.
				if _, ok := foundResults[result.Host]; !ok {
					foundResults[result.Host] = result
				}
			}
		}
	}
	wg.Wait()
	outputWriter := NewOutputWriter(r.options.JSON)
	// Now output all results in output writers
	var err error
	for _, writer := range writers {
		if r.options.HostIP {
			err = outputWriter.WriteHostIP(domain, foundResults, writer)
		} else {
			if r.options.RemoveWildcard {
				err = outputWriter.WriteHostNoWildcard(domain, foundResults, writer)
			} else {
				if r.options.CaptureSources {
					err = outputWriter.WriteSourceHost(domain, sourceMap, writer)
				} else {
					err = outputWriter.WriteHost(domain, uniqueMap, writer)
				}
			}
		}
		if err != nil {
			gologger.Error().Msgf("Could not write results for %s: %s\n", domain, err)
			return err
		}
	}

	// Show found subdomain count in any case.
	duration := durafmt.Parse(time.Since(now)).LimitFirstN(maxNumCount).String()
	var numberOfSubDomains int
	if r.options.RemoveWildcard {
		numberOfSubDomains = len(foundResults)
	} else {
		numberOfSubDomains = len(uniqueMap)
	}

	if r.options.ResultCallback != nil {
		if r.options.RemoveWildcard {
			for _, result := range foundResults {
				// Domain is the enumerated root domain and Host the subdomain,
				// matching the entries built for uniqueMap above.
				r.options.ResultCallback(&resolve.HostEntry{Domain: domain, Host: result.Host, Source: result.Source})
			}
		} else {
			for _, v := range uniqueMap {
				// Hand each callback its own copy so a retained pointer is not
				// overwritten by later iterations on pre-1.22 toolchains.
				entry := v
				r.options.ResultCallback(&entry)
			}
		}
	}
	gologger.Info().Msgf("Found %d subdomains for %s in %s\n", numberOfSubDomains, domain, duration)

	if r.options.Statistics {
		gologger.Info().Msgf("Printing source statistics for %s", domain)
		statistics := r.passiveAgent.GetStatistics()
		// This is a hack to remove the skipped count from the statistics
		// as we don't want to show it in the statistics.
		// TODO: Design a better way to do this.
		for source, count := range skippedCounts {
			if stat, ok := statistics[source]; ok {
				stat.Results -= count
				statistics[source] = stat
			}
		}
		printStatistics(statistics)
	}

	return nil
}

这里它使用了 filterAndMatchSubdomain 去实现了用户自定义拦截、匹配的正则：

// filterAndMatchSubdomain reports whether subdomain passes the user-defined
// regexes.
//
// A subdomain matching any filter regex is rejected. If match regexes are
// configured, the subdomain must match at least one of them. With no match
// regexes configured, everything that survived the filters is accepted.
func (r *Runner) filterAndMatchSubdomain(subdomain string) bool {
	// Exclusion patterns: ranging over a nil collection is simply a no-op.
	for _, deny := range r.options.filterRegexes {
		if deny.MatchString(subdomain) {
			return false
		}
	}

	// No inclusion patterns configured: accept by default.
	if r.options.matchRegexes == nil {
		return true
	}

	// Inclusion patterns: at least one has to match.
	for _, allow := range r.options.matchRegexes {
		if allow.MatchString(subdomain) {
			return true
		}
	}
	return false
}

这里一般是空的，所以具体实现子域名提取的是数据源本身，比如 fofa 这里：

if response.Size > 0 {
    // Compile the trailing-port pattern once instead of once per result.
    portSuffix := regexp.MustCompile(`:\d+$`)
    for _, subdomain := range response.Results {
        // Strip a leading URL scheme, matched case-insensitively.
        lowered := strings.ToLower(subdomain)
        if strings.HasPrefix(lowered, "http://") || strings.HasPrefix(lowered, "https://") {
            subdomain = subdomain[strings.Index(subdomain, "//")+2:]
        }
        // Drop a trailing ":port"; the replacement is a no-op when none is present.
        subdomain = portSuffix.ReplaceAllString(subdomain, "")
        results <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: subdomain}
        s.results++
    }
}

一些数据源也使用了这种匹配：

最后再看一下 runner.go：

// EnumerateMultipleDomainsWithCtx reads one target per line from reader and
// enumerates each of them with EnumerateSingleDomainWithCtx.
//
// Empty lines are skipped, as are IP-looking targets when ExcludeIps is set.
// When OutputFile is set, results are additionally written to that file; when
// OutputDirectory is set instead, each domain gets its own ".json" or ".txt"
// file in that directory. Processing stops at the first error, which is
// returned. A failure while reading the input is returned after the loop.
func (r *Runner) EnumerateMultipleDomainsWithCtx(ctx context.Context, reader io.Reader, writers []io.Writer) error {
	scanner := bufio.NewScanner(reader)
	// The pattern is a constant, so MustCompile cannot fail at runtime.
	ip := regexp.MustCompile(`^([0-9\.]+$)`)

	// Read the targets line by line.
	for scanner.Scan() {
		domain, err := normalizeLowercase(scanner.Text())
		// Targets made only of digits and dots are treated as IPs.
		isIp := ip.MatchString(domain)
		if errors.Is(err, ErrEmptyInput) || (r.options.ExcludeIps && isIp) {
			continue
		}

		var file *os.File
		// If the user has specified an output file, use that output file instead
		// of creating a new output file for each domain. Else create a new file
		// for each domain in the directory.
		if r.options.OutputFile != "" {
			outputWriter := NewOutputWriter(r.options.JSON)
			file, err = outputWriter.createFile(r.options.OutputFile, true)
			if err != nil {
				gologger.Error().Msgf("Could not create file %s for %s: %s\n", r.options.OutputFile, domain, err)
				return err
			}

			err = r.EnumerateSingleDomainWithCtx(ctx, domain, append(writers, file))

			file.Close()
		} else if r.options.OutputDirectory != "" {
			outputFile := path.Join(r.options.OutputDirectory, domain)
			if r.options.JSON {
				outputFile += ".json"
			} else {
				outputFile += ".txt"
			}

			outputWriter := NewOutputWriter(r.options.JSON)
			file, err = outputWriter.createFile(outputFile, false)
			if err != nil {
				// Report the path that actually failed, not the unset OutputFile option.
				gologger.Error().Msgf("Could not create file %s for %s: %s\n", outputFile, domain, err)
				return err
			}
			// Add the per-domain file as an extra writer.
			err = r.EnumerateSingleDomainWithCtx(ctx, domain, append(writers, file))

			file.Close()
		} else {
			err = r.EnumerateSingleDomainWithCtx(ctx, domain, writers)
		}
		if err != nil {
			return err
		}
	}

	// Scan returns false on both EOF and failure; Err tells them apart, so a
	// read error or an over-long line is not silently dropped.
	return scanner.Err()
}

可以看到它传输目标使用的是 io.Reader 而不是通道或者切片，这种方式的好处挺多的：

多种格式的目标输入
不需要把目标一次性全部添加到内存中而是使用 scanner 去读取

可以看到它对于输入目标的处理：

// RunEnumerationWithCtx picks the input of targets and hands it to
// EnumerateMultipleDomainsWithCtx.
//
// The input is chosen in this order: the Domain option (joined with newlines),
// the file named by DomainsFile, then standard input when Stdin is set. With
// none of them set, nothing is done and nil is returned. Results always go to
// r.options.Output.
func (r *Runner) RunEnumerationWithCtx(ctx context.Context) error {
	// The configured output is always the first writer.
	sinks := []io.Writer{r.options.Output}

	switch {
	case len(r.options.Domain) > 0:
		// Domains given directly as options.
		joined := strings.Join(r.options.Domain, "\n")
		return r.EnumerateMultipleDomainsWithCtx(ctx, strings.NewReader(joined), sinks)

	case r.options.DomainsFile != "":
		// Domains listed in a file.
		domainsFile, err := os.Open(r.options.DomainsFile)
		if err != nil {
			return err
		}
		defer domainsFile.Close()
		return r.EnumerateMultipleDomainsWithCtx(ctx, domainsFile, sinks)

	case r.options.Stdin:
		// Domains piped through standard input.
		return r.EnumerateMultipleDomainsWithCtx(ctx, os.Stdin, sinks)
	}

	return nil
}

输出那里它使用的是 writers []io.Writer 好处也很多，默认是一个输出，然后可以根据参数去添加一个文件写入流，最后遍历写入 writer 就行了。

学习总结

泛解析黑名单处理
reader、writers

技术

#源码学习

subfinder 源码学习

https://liancccc.github.io/2024/03/10/技术/源码学习/subfinder/

作者

守心

发布于

2024年3月10日

许可协议

crawlergo 源码学习上一篇

fingerprintx 源码学习下一篇