subfinder 源码学习

date: 2024-03-10 12:08:13

项目介绍

项目地址：https://github.com/projectdiscovery/subfinder

subfinder 是一款子域发现工具，它利用被动在线资源返回网站的有效子域。它采用简单的模块化架构，并针对速度进行了优化。它专注于只做一件事——被动子域枚举，并且做得很好。

项目结构

├── passive                         
│   ├── passive.go                  # 被动收集的主要实现
│   ├── sources.go                  # 数据源初始化
├── resolve                                 
│   ├── client.go                   # dns 相关的初始化
│   └── resolve.go                  # dns 解析域名, 收集泛解析的 hosts 黑名单, 去除存在泛解析的子域结果
├── runner
│   ├── banners.go
│   ├── config.go                   # api key 类数据源的配置文件 序列化 反序列化
│   ├── enumerate.go                # 调用 passive.go 进行单个域名被动收集的实现
│   ├── initialize.go               # 初始化 dns 和 数据源
│   ├── options.go                  # 参数解析
│   ├── outputter.go                # 输出
│   ├── runner.go                   # 调用 enumerate.go 实现批量子域收集和结果的处理
│   ├── stats.go                    # 统计数据源及结果
│   ├── util.go
│   └── validate.go                 # 参数验证、正则处理、gologger 日志模式
├── subscraping
│   ├── agent.go                    # 封装 http 请求
│   ├── extractor.go                # 正则处理、正则匹配获取子域名
│   ├── sources                     # 各种被动收集子域的数据源
│   ├── types.go                    # 数据源的接口
│   └── utils.go                    # api key 的处理

源码学习

passive.go 就是 subfinder 进行被动收集的实现了：

// EnumerateSubdomainsWithCtx runs every configured source against domain and
// streams their results over the returned channel.
//
// All work happens in a background goroutine, so the call returns at once.
// The channel is closed only when that goroutine exits (after every source has
// finished, or after setup has failed); until then a caller ranging over the
// channel keeps blocking, which is how completion is signalled.
//
// proxy, rateLimit and timeout are handed to the rate limiter and the scraping
// session; maxEnumTime bounds the whole enumeration through a context timeout.
// A setup failure is delivered as a single subscraping.Error result.
func (a *Agent) EnumerateSubdomainsWithCtx(ctx context.Context, domain string, proxy string, rateLimit int, timeout int, maxEnumTime time.Duration, options ...EnumerateOption) chan subscraping.Result {
    results := make(chan subscraping.Result)
    go func() {
        defer close(results)

        var enumerateOptions EnumerationOptions
        for _, enumerateOption := range options {
            enumerateOption(&enumerateOptions)
        }

        // Build the per-source rate limiter.
        multiRateLimiter, err := a.buildMultiRateLimiter(ctx, rateLimit, enumerateOptions.customRateLimiter)
        if err != nil {
            results <- subscraping.Result{
                Type: subscraping.Error, Error: fmt.Errorf("could not init multi rate limiter for %s: %w", domain, err),
            }
            return
        }

        // Create the session shared by all sources for this domain.
        session, err := subscraping.NewSession(domain, proxy, multiRateLimiter, timeout)
        if err != nil {
            results <- subscraping.Result{
                Type: subscraping.Error, Error: fmt.Errorf("could not init passive session for %s: %w", domain, err),
            }
            return
        }
        defer session.Close()

        // Bound the whole enumeration; deferring cancel releases the timer on
        // every exit path.
        ctx, cancel := context.WithTimeout(ctx, maxEnumTime)
        defer cancel()

        wg := &sync.WaitGroup{}
        // Run each source in parallel on the target domain.
        for _, runner := range a.sources {
            wg.Add(1)
            go func(source subscraping.Source) {
                defer wg.Done()
                // Carry the source name in the context under CtxSourceArg.
                ctxWithValue := context.WithValue(ctx, subscraping.CtxSourceArg, source.Name())
                // Forward results until the source closes its channel.
                for resp := range source.Run(ctxWithValue, domain, session) {
                    results <- resp
                }
            }(runner)
        }
        wg.Wait()
    }()
    return results
}

可以看下数据源的 Run 方法：

// Run queries the AlienVault OTX passive DNS endpoint for domain and streams
// every returned hostname as a subscraping.Subdomain result.
//
// The request runs in a background goroutine; the returned channel is closed
// when it finishes, and s.timeTaken is set to the elapsed time at that point.
// s.errors and s.results are reset on entry and then count the error and
// subdomain results emitted by this run. Request, decode and API-reported
// errors are each delivered as one subscraping.Error result.
func (s *Source) Run(ctx context.Context, domain string, session *subscraping.Session) <-chan subscraping.Result {
    results := make(chan subscraping.Result)
    s.errors = 0
    s.results = 0

    go func() {
        defer func(startTime time.Time) {
            s.timeTaken = time.Since(startTime)
            close(results)
        }(time.Now())

        resp, err := session.SimpleGet(ctx, fmt.Sprintf("https://otx.alienvault.com/api/v1/indicators/domain/%s/passive_dns", domain))
        // Only give up when there is no response at all; an error that comes
        // with a response still has a body to decode below.
        if err != nil && resp == nil {
            results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
            s.errors++
            session.DiscardHTTPResponse(resp)
            return
        }

        // Decode the body, then close it whether or not decoding succeeded.
        var response alienvaultResponse
        err = json.NewDecoder(resp.Body).Decode(&response)
        resp.Body.Close()
        if err != nil {
            results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
            s.errors++
            return
        }

        // The API reports failures inside the JSON payload.
        if response.Error != "" {
            results <- subscraping.Result{
                Source: s.Name(), Type: subscraping.Error, Error: fmt.Errorf("%s, %s", response.Detail, response.Error),
            }
            // Count this like the other error paths so the counter matches
            // the error results actually emitted.
            s.errors++
            return
        }

        for _, record := range response.PassiveDNS {
            results <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: record.Hostname}
            s.results++
        }
    }()

    return results
}

这样使用协程让收集子域名的速度大大增加，对于一些有速率限制的数据源，subfinder 也做有相应的处理：

// buildMultiRateLimiter registers one rate limiter per source of the agent and
// returns the combined limiter.
//
// For each source, a custom entry in rateLimit (looked up by lower-cased source
// name) is resolved through sourceRateLimitOrDefault with globalRateLimit as
// the fallback. A resulting limit greater than zero is applied per second. A
// source with no entry, or a resolved limit of zero, gets math.MaxUint32 per
// millisecond. rateLimit may be nil, in which case no source has a custom
// entry. The first error from addRateLimiter stops the loop and is returned.
func (a *Agent) buildMultiRateLimiter(ctx context.Context, globalRateLimit int, rateLimit *subscraping.CustomRateLimit) (*ratelimit.MultiLimiter, error) {
    var multiRateLimiter *ratelimit.MultiLimiter
    var err error
    for _, source := range a.sources {
        var rl uint
        // Guard against a nil rateLimit: callers that pass no custom rate
        // limit option would otherwise hit a nil pointer dereference here.
        if rateLimit != nil {
            if sourceRateLimit, ok := rateLimit.Custom.Get(strings.ToLower(source.Name())); ok {
                rl = sourceRateLimitOrDefault(uint(globalRateLimit), sourceRateLimit)
            }
        }

        if rl > 0 {
            // Explicit limit for this source, per second.
            multiRateLimiter, err = addRateLimiter(ctx, multiRateLimiter, source.Name(), rl, time.Second)
        } else {
            // No limit configured: use the largest value so the limiter
            // never throttles in practice.
            multiRateLimiter, err = addRateLimiter(ctx, multiRateLimiter, source.Name(), math.MaxUint32, time.Millisecond)
        }
        if err != nil {
            return multiRateLimiter, err
        }
    }
    return multiRateLimiter, nil
}

速率限制器会被添加到 session 中，也就是 http 请求那里 HTTPRequest ：

// Read the source name stored in ctx under CtxSourceArg.
// NOTE(review): the type assertion is unchecked, so a ctx without a string
// under that key would panic here — confirm every caller sets it.
sourceName := ctx.Value(CtxSourceArg).(string)
// Take from the multi rate limiter under this source's name before sending;
// a limiter error aborts the request.
mrlErr := s.MultiRateLimiter.Take(sourceName)
if mrlErr != nil {
    return nil, mrlErr
}

return httpRequestWrapper(s.Client, req)

这个就是 subfinder 被动域名收集的具体实现。

再看一下数据源的处理部分，它初始化数据源的方式是这样的：

// init indexes every entry of AllSources in NameSourceMap under its
// lower-cased name.
func init() {
    for idx := range AllSources {
        src := AllSources[idx]
        key := strings.ToLower(src.Name())
        NameSourceMap[key] = src
    }
}

// New creates a new agent for passive subdomain discovery.
//
// Sources are selected in this order:
//  1. useAllSources selects every entry of NameSourceMap;
//  2. otherwise the names in sourceNames are used, and unknown names are
//     logged as warnings and skipped;
//  3. otherwise every source whose IsDefault() reports true is used.
//
// The names in excludedSourceNames are then removed, and when
// useSourcesSupportingRecurse is set, sources without recursive support are
// removed as well. The final selection is logged at debug level, and any
// warning registered in sourceWarnings for a selected source is logged.
func New(sourceNames, excludedSourceNames []string, useAllSources, useSourcesSupportingRecurse bool) *Agent {
    sources := make(map[string]subscraping.Source, len(AllSources))

    switch {
    case useAllSources:
        maps.Copy(sources, NameSourceMap)
    case len(sourceNames) > 0:
        for _, source := range sourceNames {
            selected := NameSourceMap[source]
            if selected == nil {
                gologger.Warning().Msgf("There is no source with the name: %s", source)
                continue
            }
            sources[source] = selected
        }
    default:
        for _, currentSource := range AllSources {
            if currentSource.IsDefault() {
                sources[currentSource.Name()] = currentSource
            }
        }
    }

    for _, sourceName := range excludedSourceNames {
        delete(sources, sourceName)
    }

    if useSourcesSupportingRecurse {
        for sourceName, source := range sources {
            if !source.HasRecursiveSupport() {
                delete(sources, sourceName)
            }
        }
    }

    // Pass the names as an argument, not as the format string, so a '%' in a
    // name cannot be misread as a formatting verb.
    gologger.Debug().Msgf("Selected source(s) for this search: %s", strings.Join(maps.Keys(sources), ", "))

    for _, currentSource := range sources {
        if warning, ok := sourceWarnings.Get(strings.ToLower(currentSource.Name())); ok {
            gologger.Warning().Msg(warning)
        }
    }

    // Create the agent from the sources that survived selection and exclusion.
    agent := &Agent{sources: maps.Values(sources)}

    return agent
}

然后是 dns 解析部分，使用的是 github.com/projectdiscovery/dnsx/libs/dnsx 完成的 dns 解析，泛解析的处理方式如下：

// InitWildcards probes domain for wildcard DNS and records the answers.
//
// It resolves maxWildcardChecks names built from a freshly generated xid
// label under domain. Every host returned is added to r.wildcardIPs. As soon
// as one probe returns no hosts the function stops and reports that domain is
// not a wildcard domain.
func (r *ResolutionPool) InitWildcards(domain string) error {
    for attempt := 0; attempt < maxWildcardChecks; attempt++ {
        // A generated label gives a subdomain that should not really exist.
        probe := xid.New().String() + "." + domain

        // The lookup error is ignored on purpose: only the presence of
        // answers decides whether the probe counts.
        answers, _ := r.DNSClient.Lookup(probe)
        if len(answers) == 0 {
            return fmt.Errorf("%s is not a wildcard domain", domain)
        }

        // Remember every host the wildcard resolved to.
        for _, answer := range answers {
            r.wildcardIPs[answer] = struct{}{}
        }
    }
    return nil
}

// resolveWorker consumes r.Tasks until the channel is closed and emits one
// Result per accepted host on r.Results, then marks itself done on r.wg.
//
// When wildcard removal is off, every task is passed through unresolved with
// an empty IP. Otherwise the host is looked up: a lookup error is reported as
// an Error result, a host with no answers is dropped, and a host with any
// answer present in r.wildcardIPs is dropped as a wildcard hit. Surviving
// hosts are emitted with their first answer as IP.
func (r *ResolutionPool) resolveWorker() {
    for task := range r.Tasks {
        if !r.removeWildcard {
            r.Results <- Result{Type: Subdomain, Host: task.Host, IP: "", Source: task.Source}
            continue
        }

        answers, lookupErr := r.DNSClient.Lookup(task.Host)
        switch {
        case lookupErr != nil:
            r.Results <- Result{Type: Error, Host: task.Host, Source: task.Source, Error: lookupErr}
            continue
        case len(answers) == 0:
            continue
        }

        // Check the answers against the wildcard blacklist.
        isWildcard := false
        for _, answer := range answers {
            if _, listed := r.wildcardIPs[answer]; listed {
                isWildcard = true
                break
            }
        }
        if isWildcard {
            continue
        }

        r.Results <- Result{Type: Subdomain, Host: task.Host, IP: answers[0], Source: task.Source}
    }
    r.wg.Done()
}

然后再看看 subfinder 是如何处理被动收集的结果：

// EnumerateSingleDomainWithCtx enumerates the subdomains of one domain and
// writes the results to every writer in writers.
//
// Steps:
//  1. If wildcard removal is enabled, create a resolution pool and collect
//     the wildcard hosts of domain.
//  2. Start the passive enumeration via EnumerateSubdomainsWithCtx.
//  3. Consume the results in a goroutine:
//     3.1 skip values that do not end in "."+domain;
//     3.2 lower-case the value and strip every "*.";
//     3.3 apply the user's regexes through filterAndMatchSubdomain;
//     3.4 de-duplicate with uniqueMap and record the reporting sources in
//     sourceMap;
//     3.5 queue each new host for resolution when wildcard removal is enabled.
//  4. Drain the resolution pool (wildcard removal only), then wait for the
//     consumer goroutine.
//  5. Write the output, invoke the result callback, and log the summary and
//     the optional per-source statistics.
//
// It returns the first write error. A failed wildcard probe is only logged.
func (r *Runner) EnumerateSingleDomainWithCtx(ctx context.Context, domain string, writers []io.Writer) error {
    gologger.Info().Msgf("Enumerating subdomains for %s\n", domain)

    // Check if the user has asked to remove wildcards explicitly.
    // If yes, create the resolution pool and get the wildcards for the current domain.
    var resolutionPool *resolve.ResolutionPool
    if r.options.RemoveWildcard {
        resolutionPool = r.resolverClient.NewResolutionPool(r.options.Threads, r.options.RemoveWildcard)
        err := resolutionPool.InitWildcards(domain)
        if err != nil {
            // Log the error but don't quit.
            gologger.Warning().Msgf("Could not get wildcards for domain %s: %s\n", domain, err)
        }
    }

    // Run the passive subdomain enumeration.
    now := time.Now()
    passiveResults := r.passiveAgent.EnumerateSubdomainsWithCtx(ctx, domain, r.options.Proxy, r.options.RateLimit, r.options.Timeout, time.Duration(r.options.MaxEnumerationTime)*time.Minute, passive.WithCustomRateLimit(r.rateLimit))

    wg := &sync.WaitGroup{}
    wg.Add(1)
    // Create a unique map for filtering duplicate subdomains out.
    uniqueMap := make(map[string]resolve.HostEntry)
    // Create a map to track sources for each host.
    sourceMap := make(map[string]map[string]struct{})
    // Per-source count of results that were skipped (out of scope or duplicate).
    skippedCounts := make(map[string]int)
    // Process the results in a separate goroutine.
    go func() {
        for result := range passiveResults {
            switch result.Type {
            case subscraping.Error:
                gologger.Warning().Msgf("Could not run source %s: %s\n", result.Source, result.Error)
            case subscraping.Subdomain:
                // Only accept values that are subdomains of the target domain.
                if !strings.HasSuffix(result.Value, "."+domain) {
                    skippedCounts[result.Source]++
                    continue
                }
                subdomain := strings.ReplaceAll(strings.ToLower(result.Value), "*.", "")
                // Apply the user-supplied filter and match regexes.
                if matchSubdomain := r.filterAndMatchSubdomain(subdomain); matchSubdomain {
                    if _, ok := uniqueMap[subdomain]; !ok {
                        sourceMap[subdomain] = make(map[string]struct{})
                    }

                    // Log the verbose message about the found subdomain per source.
                    if _, ok := sourceMap[subdomain][result.Source]; !ok {
                        gologger.Verbose().Label(result.Source).Msg(subdomain)
                    }

                    sourceMap[subdomain][result.Source] = struct{}{}

                    // Check if the subdomain is a duplicate. If not,
                    // send the subdomain for resolution.
                    if _, ok := uniqueMap[subdomain]; ok {
                        skippedCounts[result.Source]++
                        continue
                    }

                    hostEntry := resolve.HostEntry{Domain: domain, Host: subdomain, Source: result.Source}

                    uniqueMap[subdomain] = hostEntry
                    // If the user asked to remove wildcards, send the host to
                    // the resolve queue.
                    if r.options.RemoveWildcard {
                        resolutionPool.Tasks <- hostEntry
                    }
                }
            }
        }
        // Close the task channel only if wildcards are asked to be removed.
        if r.options.RemoveWildcard {
            close(resolutionPool.Tasks)
        }
        wg.Done()
    }()

    // If the user asked to remove wildcards, listen on the results
    // queue and write to the map.
    foundResults := make(map[string]resolve.Result)
    if r.options.RemoveWildcard {
        // Process the results coming from the resolution pool.
        for result := range resolutionPool.Results {
            switch result.Type {
            case resolve.Error:
                gologger.Warning().Msgf("Could not resolve host: %s\n", result.Error)
            case resolve.Subdomain:
                // Add the found subdomain to a map, keeping the first result per host.
                if _, ok := foundResults[result.Host]; !ok {
                    foundResults[result.Host] = result
                }
            }
        }
    }
    wg.Wait()
    outputWriter := NewOutputWriter(r.options.JSON)
    // Write all results to every output writer.
    var err error
    for _, writer := range writers {
        if r.options.HostIP {
            err = outputWriter.WriteHostIP(domain, foundResults, writer)
        } else {
            if r.options.RemoveWildcard {
                err = outputWriter.WriteHostNoWildcard(domain, foundResults, writer)
            } else {
                if r.options.CaptureSources {
                    err = outputWriter.WriteSourceHost(domain, sourceMap, writer)
                } else {
                    err = outputWriter.WriteHost(domain, uniqueMap, writer)
                }
            }
        }
        if err != nil {
            gologger.Error().Msgf("Could not write results for %s: %s\n", domain, err)
            return err
        }
    }

    // Show found subdomain count in any case.
    duration := durafmt.Parse(time.Since(now)).LimitFirstN(maxNumCount).String()
    var numberOfSubDomains int
    if r.options.RemoveWildcard {
        numberOfSubDomains = len(foundResults)
    } else {
        numberOfSubDomains = len(uniqueMap)
    }

    if r.options.ResultCallback != nil {
        if r.options.RemoveWildcard {
            for _, result := range foundResults {
                // Domain carries the enumerated root domain, matching the
                // entries built on the non-wildcard path.
                r.options.ResultCallback(&resolve.HostEntry{Domain: domain, Host: result.Host, Source: result.Source})
            }
        } else {
            for _, v := range uniqueMap {
                // Copy first so every callback gets its own pointer; before
                // Go 1.22 the range variable is a single reused location.
                entry := v
                r.options.ResultCallback(&entry)
            }
        }
    }
    gologger.Info().Msgf("Found %d subdomains for %s in %s\n", numberOfSubDomains, domain, duration)

    if r.options.Statistics {
        gologger.Info().Msgf("Printing source statistics for %s", domain)
        statistics := r.passiveAgent.GetStatistics()
        // This is a hack to remove the skipped count from the statistics
        // as we don't want to show it in the statistics.
        // TODO: Design a better way to do this.
        for source, count := range skippedCounts {
            if stat, ok := statistics[source]; ok {
                stat.Results -= count
                statistics[source] = stat
            }
        }
        printStatistics(statistics)
    }

    return nil
}

这里它使用了 filterAndMatchSubdomain 去实现了用户自定义拦截、匹配的正则：

// filterAndMatchSubdomain reports whether subdomain passes the user's regexes.
//
// A subdomain matching any filter regex is rejected. If match regexes are
// configured (the slice is non-nil), the subdomain must match at least one of
// them. With no match regexes configured, everything not filtered is accepted.
func (r *Runner) filterAndMatchSubdomain(subdomain string) bool {
    // Deny list first: a single hit rejects the subdomain.
    for _, deny := range r.options.filterRegexes {
        if deny.MatchString(subdomain) {
            return false
        }
    }

    // No allow list configured: accept by default.
    if r.options.matchRegexes == nil {
        return true
    }

    // Allow list configured: at least one pattern has to match.
    for _, allow := range r.options.matchRegexes {
        if allow.MatchString(subdomain) {
            return true
        }
    }
    return false
}

这里一般是空的，所以具体实现子域名提取的是数据源本身，比如 fofa 这里：

// Normalize every fofa result to a bare host name and emit it as a subdomain.
if response.Size > 0 {
    // Compile the trailing-port pattern once instead of once per result.
    portSuffix := regexp.MustCompile(`:\d+$`)
    for _, subdomain := range response.Results {
        // Strip a leading http:// or https:// scheme (case-insensitive).
        lowered := strings.ToLower(subdomain)
        if strings.HasPrefix(lowered, "http://") || strings.HasPrefix(lowered, "https://") {
            subdomain = subdomain[strings.Index(subdomain, "//")+2:]
        }
        // Drop a trailing ":port"; the string is returned unchanged when
        // there is no match.
        subdomain = portSuffix.ReplaceAllString(subdomain, "")
        results <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: subdomain}
        s.results++
    }
}

一些数据源也使用了这种匹配：

最后再看一下 runner.go：

// EnumerateMultipleDomainsWithCtx reads one domain per line from reader and
// enumerates each of them with EnumerateSingleDomainWithCtx.
//
// Lines that normalizeLowercase rejects with ErrEmptyInput are skipped, and so
// are lines made only of digits and dots when r.options.ExcludeIps is set.
// Output goes to writers plus, depending on the options, either the single
// output file (r.options.OutputFile) or a per-domain file inside
// r.options.OutputDirectory with a .json or .txt extension.
//
// It returns the first file-creation or enumeration error, or the scanner's
// read error if reading the input failed.
func (r *Runner) EnumerateMultipleDomainsWithCtx(ctx context.Context, reader io.Reader, writers []io.Writer) error {
    scanner := bufio.NewScanner(reader)
    ipPattern := regexp.MustCompile(`^([0-9\.]+$)`)
    // Read the targets line by line.
    for scanner.Scan() {
        domain, err := normalizeLowercase(scanner.Text())
        // Detect targets that look like an IP address.
        isIP := ipPattern.MatchString(domain)
        if errors.Is(err, ErrEmptyInput) || (r.options.ExcludeIps && isIP) {
            continue
        }

        var file *os.File
        // If the user has specified an output file, use that output file instead
        // of creating a new output file for each domain. Else create a new file
        // for each domain in the directory.
        if r.options.OutputFile != "" {
            outputWriter := NewOutputWriter(r.options.JSON)
            file, err = outputWriter.createFile(r.options.OutputFile, true)
            if err != nil {
                // Report the domain being processed, not the whole domain list.
                gologger.Error().Msgf("Could not create file %s for %s: %s\n", r.options.OutputFile, domain, err)
                return err
            }

            err = r.EnumerateSingleDomainWithCtx(ctx, domain, append(writers, file))

            file.Close()
        } else if r.options.OutputDirectory != "" {
            outputFile := path.Join(r.options.OutputDirectory, domain)
            if r.options.JSON {
                outputFile += ".json"
            } else {
                outputFile += ".txt"
            }

            outputWriter := NewOutputWriter(r.options.JSON)
            file, err = outputWriter.createFile(outputFile, false)
            if err != nil {
                // Report the per-domain file that failed, not OutputFile,
                // which is empty in this branch.
                gologger.Error().Msgf("Could not create file %s for %s: %s\n", outputFile, domain, err)
                return err
            }
            // Add the per-domain file as an extra writer.
            err = r.EnumerateSingleDomainWithCtx(ctx, domain, append(writers, file))

            file.Close()
        } else {
            err = r.EnumerateSingleDomainWithCtx(ctx, domain, writers)
        }
        if err != nil {
            return err
        }
    }
    // Surface read failures (for example an over-long line) instead of
    // treating them as the end of input.
    return scanner.Err()
}

可以看到它传输目标使用的是 io.Reader 而不是通道或者切片，这种方式的好处挺多的：

多种格式的目标输入
不需要把目标一次性全部添加到内存中而是使用 scanner 去读取

可以看到它对于输入目标的处理：

// RunEnumerationWithCtx picks the input source for the targets and hands it
// to EnumerateMultipleDomainsWithCtx.
//
// The sources are tried in order: the domains in r.options.Domain (joined one
// per line), the file named by r.options.DomainsFile, then standard input
// when r.options.Stdin is set. With none of them configured it returns nil.
// r.options.Output is always the first output writer.
func (r *Runner) RunEnumerationWithCtx(ctx context.Context) error {
    sinks := []io.Writer{r.options.Output}

    switch {
    case len(r.options.Domain) > 0:
        // Domains given directly in the options.
        joined := strings.Join(r.options.Domain, "\n")
        return r.EnumerateMultipleDomainsWithCtx(ctx, strings.NewReader(joined), sinks)

    case r.options.DomainsFile != "":
        // Domains listed in a file.
        domainsFile, err := os.Open(r.options.DomainsFile)
        if err != nil {
            return err
        }
        defer domainsFile.Close()
        return r.EnumerateMultipleDomainsWithCtx(ctx, domainsFile, sinks)

    case r.options.Stdin:
        // Domains piped in on standard input.
        return r.EnumerateMultipleDomainsWithCtx(ctx, os.Stdin, sinks)
    }

    return nil
}

输出那里它使用的是 writers []io.Writer 好处也很多，默认是一个输出，然后可以根据参数去添加一个文件写入流，最后遍历写入 writer 就行了。

学习总结

泛解析黑名单处理
reader、writers