Some subdomains were not getting scraped correctly from the data source. Added logic to clean these up.

This commit is contained in:
fleetcaptain 2021-01-28 22:12:29 -08:00
parent e42290fbab
commit 0f3f8f70b3
1 changed files with 25 additions and 2 deletions

View File

@ -1051,6 +1051,30 @@ def main(domain, threads, savefile, ports, silent, verbose, enable_bruteforce, e
if subdomains: if subdomains:
subdomains = sorted(subdomains, key=subdomain_sorting_key) subdomains = sorted(subdomains, key=subdomain_sorting_key)
# clean up any records that have embedded line breaks
# the more appropriate thing to do is figure out which data source is doing this and update the corresponding collector code
# but this is the quick fix for now
temp_subdomains = []
new_subdomains = []
# for each record, check to make sure it doesn't have <br> in it
for record in subdomains:
record = record.lower()
if ('<br>' in record):
# line breaks, split and add each one to temp_subdomains array
temp_records = record.split('<br>')
for temp_record in temp_records:
temp_subdomains.append(temp_record)
else:
# no issues noted with this record, add to new_subdomains array
new_subdomains.append(record)
# merge temp_subdomains and new_subdomains
for tr in temp_subdomains:
new_subdomains.append(tr)
# finally, replace subdomains with the cleaned up new_subdomains array
# deduplicate the list while we're at it
subdomains = list(dict.fromkeys(new_subdomains))
if savefile: if savefile:
write_file(savefile, subdomains) write_file(savefile, subdomains)
@ -1217,8 +1241,7 @@ if __name__ == "__main__":
f.close() f.close()
else: else:
res = main(domain, threads, savefile, ports, silent, verbose=verbose, enable_bruteforce=enable_bruteforce, engines=engines, quiet=quiet) res = main(domain, threads, savefile, ports, silent, verbose=verbose, enable_bruteforce=enable_bruteforce, engines=engines, quiet=quiet)
# Code added here
if (analyze): if (analyze):
# res is the list of subdomains e.g. www.example.com, mail.example.com, etc # res is the list of subdomains e.g. www.example.com, mail.example.com, etc
if not silent: if not silent: