feat: bucketed --urls output with strict regex and third-party denylist
The previous --urls mode was a plain grep for "https?://..." which on a
real APK produced thousands of lines, half of them junk strings extracted
from Kotlin stdlib's compression dictionary ("http://An Introduction to..."
fragments) and the other half SDK URLs (Google, Firebase, AppsFlyer,
Datadog, Sentry, ...) that the analyst is not looking for. The signal —
first-party backend hosts — was buried.
Two changes:
1. Strict URL regex: hostname must have at least one dot and end in a 2+
letter TLD, with no whitespace / angle brackets / non-printables in the
path. This eliminates the dictionary-fragment noise.
2. Bucket the surviving URLs into "likely first-party" vs "third-party"
using references/third_party_hosts.txt — a curated denylist of
~80 patterns covering Google/Firebase/Apple/Microsoft/Adobe, attribution
and observability vendors (AppsFlyer, Datadog, Sentry, Bugsnag, ...),
payments (Stripe, PayU, Adyen, ...), support/chat SDKs, CAs, and
standards namespaces (w3.org, etc.).
The new output starts with a frequency-sorted list of likely first-party
hosts — which is the artifact every reverse-engineer wants on the first
page — followed by the collapsed third-party list and the full URL set
for first-party hosts only.
The denylist is a sidecar text file (one regex per line) so users can
extend or override it without editing the script.
This commit is contained in:
parent
dbb19f0a22
commit
2e6fc63453
|
|
@ -0,0 +1,122 @@
|
|||
# Third-party host denylist used by find-api-calls.sh --urls.
|
||||
#
|
||||
# Patterns are extended-regex hostname suffixes / fragments. A host is
|
||||
# considered "third-party noise" if any pattern below matches anywhere
|
||||
# in the hostname. Lines starting with '#' and blank lines are ignored.
|
||||
#
|
||||
# This list is intentionally conservative: when a pattern would hide a
|
||||
# legitimate first-party host (e.g. an app may run its own *.s3.amazonaws.com
|
||||
# bucket), keep the pattern but expect manual review of the bucketed output.
|
||||
|
||||
# Google / Firebase / Play / Crashlytics
|
||||
\.googleapis\.com$
|
||||
\.google\.com$
|
||||
\.gstatic\.com$
|
||||
\.googleusercontent\.com$
|
||||
\.googletagmanager\.com$
|
||||
\.googlesyndication\.com$
|
||||
\.firebaseio\.com$
|
||||
\.firebaseapp\.com$
|
||||
\.firebaseinstallations\.googleapis\.com$
|
||||
\.firebaseremoteconfig\.googleapis\.com$
|
||||
\.crashlytics\.com$
|
||||
\.app-measurement\.com$
|
||||
|
||||
# Apple / Microsoft / Adobe
|
||||
\.apple\.com$
|
||||
\.icloud\.com$
|
||||
\.microsoft\.com$
|
||||
\.live\.com$
|
||||
\.office\.com$
|
||||
\.adobe\.com$
|
||||
ns\.adobe\.com
|
||||
|
||||
# Meta
|
||||
\.facebook\.com$
|
||||
\.fbcdn\.net$
|
||||
\.instagram\.com$
|
||||
\.whatsapp\.com$
|
||||
|
||||
# Other social / messaging / video
|
||||
\.twitter\.com$
|
||||
\.x\.com$
|
||||
\.tiktok\.com$
|
||||
\.youtube\.com$
|
||||
\.youtu\.be$
|
||||
\.linkedin\.com$
|
||||
\.snapchat\.com$
|
||||
\.pinterest\.com$
|
||||
\.reddit\.com$
|
||||
|
||||
# Mobile attribution / analytics / observability
|
||||
\.appsflyersdk\.com$
|
||||
\.appsflyer\.com$
|
||||
\.adjust\.com$
|
||||
\.branch\.io$
|
||||
\.amplitude\.com$
|
||||
\.segment\.com$
|
||||
\.mixpanel\.com$
|
||||
\.hotjar\.com$
|
||||
\.clarity\.ms$
|
||||
\.datadoghq\.(com|eu|us)$
|
||||
\.sentry\.io$
|
||||
\.bugsnag\.com$
|
||||
\.newrelic\.com$
|
||||
\.instabug\.com$
|
||||
\.embrace\.io$
|
||||
\.rollout\.io$
|
||||
\.launchdarkly\.com$
|
||||
|
||||
# Push / notifications
|
||||
\.onesignal\.com$
|
||||
\.urbanairship\.com$
|
||||
\.airship\.com$
|
||||
|
||||
# Support / chat
|
||||
\.zendesk\.com$
|
||||
\.intercom\.io$
|
||||
\.intercomcdn\.com$
|
||||
\.helpshift\.com$
|
||||
\.salesforce\.com$
|
||||
\.freshchat\.com$
|
||||
\.kustomerapp\.com$
|
||||
|
||||
# Payments
|
||||
\.stripe\.com$
|
||||
\.braintreepayments\.com$
|
||||
\.braintreegateway\.com$
|
||||
\.payu\.com$
|
||||
\.payu\.in$
|
||||
\.paypal\.com$
|
||||
\.adyen\.com$
|
||||
\.checkout\.com$
|
||||
\.klarna\.com$
|
||||
|
||||
# Maps / location
|
||||
\.mapbox\.com$
|
||||
\.openstreetmap\.org$
|
||||
|
||||
# Storage / CDN (often third-party even when the bucket name is app-specific)
|
||||
\.s3\.amazonaws\.com$
|
||||
\.cloudfront\.net$
|
||||
\.akamaihd\.net$
|
||||
\.akamaized\.net$
|
||||
\.fastly\.net$
|
||||
\.cloudflare\.com$
|
||||
\.azureedge\.net$
|
||||
|
||||
# DNS / well-known infra
|
||||
\.localhost$
|
||||
^localhost
|
||||
^127\.
|
||||
|
||||
# Standards / RFCs / placeholders that show up as XML/XMP namespaces
|
||||
\.w3\.org$
|
||||
\.w3c\.org$
|
||||
example\.(com|org|net)$
|
||||
|
||||
# Certificate authorities
|
||||
\.sectigo\.com$
|
||||
\.entrust\.com$
|
||||
\.digicert\.com$
|
||||
\.letsencrypt\.org$
|
||||
|
|
@ -158,9 +158,65 @@ if [[ "$SEARCH_ALL" == true || "$SEARCH_PATHS" == true ]]; then
|
|||
fi
|
||||
|
||||
# --- Hardcoded URLs ---
|
||||
# A loose grep for http(s)://... drowns in compression-dictionary garbage and
|
||||
# in third-party SDK URLs (Google, Firebase, AppsFlyer, Datadog, ...). The
|
||||
# strict regex requires a syntactically valid hostname and rejects strings
|
||||
# containing whitespace, double quotes, or angle brackets. Hosts are
|
||||
# then bucketed into "first-party candidates" vs "third-party (denylist)".
|
||||
if [[ "$SEARCH_ALL" == true || "$SEARCH_URLS" == true ]]; then
|
||||
section "Hardcoded URLs (http:// and https://)"
|
||||
run_grep '"https?://[^"]+'
|
||||
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
DENYLIST="$HERE/../references/third_party_hosts.txt"
|
||||
# Hostname must have at least one dot and end in a 2+ letter TLD: one or
# more dot-separated labels, then ".<letters>". The middle group is "*",
# not "+", so apex domains (https://example.com/...) match as well as
# sub-domains; with "+" a host needed two dots and apex URLs were dropped.
# Optional ":port", then an optional path that stops at the first double
# quote, angle bracket, or whitespace character.
STRICT_URL='https?://[A-Za-z0-9-]+(\.[A-Za-z0-9-]+)*\.[A-Za-z]{2,}(:[0-9]{1,5})?(/[^"<>[:space:]]*)?'
|
||||
|
||||
# Scratch files: TMP holds the distinct URLs, HOSTS_TMP the distinct hosts.
# Both are created before the trap is installed so that one EXIT handler
# removes either file if the script stops part-way through this section
# (previously only TMP was covered and HOSTS_TMP could be left behind).
TMP="$(mktemp)"
HOSTS_TMP="$(mktemp)"
trap 'rm -f -- "$TMP" "$HOSTS_TMP"' EXIT

# Every distinct strict-URL match in the Java/Kotlin sources under
# SOURCE_DIR. -h drops file names, -o prints only the matched text; grep's
# stderr is discarded.
grep -rhoE --include='*.java' --include='*.kt' "$STRICT_URL" "$SOURCE_DIR" 2>/dev/null \
  | sort -u > "$TMP"

# Extract host: strip scheme, take part up to first ':' or '/'.
sed -E 's#^https?://##; s#[/:].*$##' "$TMP" | sort -u > "$HOSTS_TMP"
|
||||
|
||||
#######################################
# Split a list of hostnames into denylisted ("third-party") and remaining
# ("first-party candidate") sets.
# Globals:   DENY_REGEX  (written) combined ERE, empty if no patterns
#            THIRD_HOSTS (written) newline-separated hosts that matched
#            FIRST_HOSTS (written) newline-separated hosts that did not
# Arguments: $1 - denylist file, one extended regex per line; blank lines
#                 and lines starting with '#' are ignored; may not exist
#            $2 - file with one hostname per line
#######################################
bucket_hosts_by_denylist() {
  local denylist_file=$1
  local hosts_file=$2

  DENY_REGEX=""
  if [[ -f "$denylist_file" ]]; then
    # Build a single combined regex from the denylist (one line each).
    # - CRs are dropped so a CRLF-saved file does not end every pattern
    #   with a stray "\r" after the "$" anchor.
    # - [[:space:]] replaces \s, which is a GNU extension.
    # - "|| true": grep -v exits 1 when nothing survives the filter; that
    #   is a valid "no patterns" result, not an error.
    DENY_REGEX="$(tr -d '\r' < "$denylist_file" \
      | { grep -vE '^[[:space:]]*(#|$)' || true; } \
      | tr '\n' '|' \
      | sed 's/|$//')"
  fi

  if [[ -n "$DENY_REGEX" ]]; then
    THIRD_HOSTS=$(grep -E -- "$DENY_REGEX" "$hosts_file" || true)
    FIRST_HOSTS=$(grep -vE -- "$DENY_REGEX" "$hosts_file" || true)
  else
    # Missing denylist, or one with no active patterns. An empty ERE
    # matches every line, which would file every host under third-party,
    # so treat all hosts as first-party candidates instead.
    THIRD_HOSTS=""
    FIRST_HOSTS=$(cat "$hosts_file")
  fi
}

bucket_hosts_by_denylist "$DENYLIST" "$HOSTS_TMP"
|
||||
|
||||
section "Likely First-Party Hosts (frequency-sorted)"
|
||||
if [[ -n "$FIRST_HOSTS" ]]; then
|
||||
while IFS= read -r h; do
|
||||
[[ -z "$h" ]] && continue
|
||||
n=$(grep -cE "://${h//./\\.}([/:\"]|$)" "$TMP" || true)
|
||||
printf ' %5d %s\n' "$n" "$h"
|
||||
done <<< "$FIRST_HOSTS" | sort -rn -k1
|
||||
else
|
||||
echo " (none — every URL matched the third-party denylist)"
|
||||
fi
|
||||
|
||||
section "Third-Party Hosts (denylist matches, collapsed)"
|
||||
if [[ -n "$THIRD_HOSTS" ]]; then
|
||||
echo "$THIRD_HOSTS" | sed 's/^/ /'
|
||||
else
|
||||
echo " (none)"
|
||||
fi
|
||||
|
||||
section "All First-Party URLs (full strings)"
|
||||
if [[ -n "$FIRST_HOSTS" ]]; then
|
||||
while IFS= read -r h; do
|
||||
[[ -z "$h" ]] && continue
|
||||
grep -E "://${h//./\\.}([/:\"]|$)" "$TMP" | sed 's/^/ /'
|
||||
done <<< "$FIRST_HOSTS"
|
||||
fi
|
||||
|
||||
rm -f "$HOSTS_TMP" "$TMP"
|
||||
trap - EXIT
|
||||
|
||||
section "HttpURLConnection"
|
||||
run_grep '(openConnection|setRequestMethod|HttpURLConnection|HttpsURLConnection)'
|
||||
section "WebView URLs"
|
||||
|
|
|
|||
Loading…
Reference in New Issue