From 0f3f8f70b3edc0914b432c39fc355159c4eaa063 Mon Sep 17 00:00:00 2001
From: fleetcaptain <17651144+fleetcaptain@users.noreply.github.com>
Date: Thu, 28 Jan 2021 22:12:29 -0800
Subject: [PATCH] Some subdomains were not getting scraped correctly from data
source. Added logic to clean these up
---
turbolist3r.py | 27 +++++++++++++++++++++++++--
1 file changed, 25 insertions(+), 2 deletions(-)
diff --git a/turbolist3r.py b/turbolist3r.py
index c0ae8de..947565d 100644
--- a/turbolist3r.py
+++ b/turbolist3r.py
@@ -1051,6 +1051,30 @@ def main(domain, threads, savefile, ports, silent, verbose, enable_bruteforce, e
if subdomains:
subdomains = sorted(subdomains, key=subdomain_sorting_key)
+ # clean up any records that have embedded line breaks
+ # the more appropriate thing to do is figure out which data source is doing this and update the corresponding collector code
+ # but this is the quick fix for now
+ temp_subdomains = []
+ new_subdomains = []
+ # for each record, check to make sure it doesn't have \n in it
+ for record in subdomains:
+ record = record.lower()
+ if ('\n' in record):
+ # line breaks, split and add each one to temp_subdomains array
+ temp_records = record.split('\n')
+ for temp_record in temp_records:
+ temp_subdomains.append(temp_record)
+ else:
+ # no issues noted with this record, add to new_subdomains array
+ new_subdomains.append(record)
+ # merge temp_subdomains and new_subdomains
+ for tr in temp_subdomains:
+ new_subdomains.append(tr)
+ # finally, replace subdomains with the cleaned up new_subdomains array
+ # deduplicate the list while we're at it
+ subdomains = list(dict.fromkeys(new_subdomains))
+
+
if savefile:
write_file(savefile, subdomains)
@@ -1217,8 +1241,7 @@ if __name__ == "__main__":
f.close()
else:
res = main(domain, threads, savefile, ports, silent, verbose=verbose, enable_bruteforce=enable_bruteforce, engines=engines, quiet=quiet)
-
- # Code added here
+
if (analyze):
# res is the list of subdomains e.g. www.example.com, mail.example.com, etc
if not silent: