From 2e6fc63453c0158bb294cfb14c0142071127002b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Tajchert?=
Date: Wed, 29 Apr 2026 01:23:56 +0200
Subject: [PATCH] feat: bucketed --urls output with strict regex and third-party denylist
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous --urls mode was a plain grep for "https?://..." which on a
real APK produced thousands of lines, half of them junk strings extracted
from Kotlin stdlib's compression dictionary ("http://An Introduction to..."
fragments) and the other half SDK URLs (Google, Firebase, AppsFlyer,
Datadog, Sentry, ...) that the analyst is not looking for. The signal —
first-party backend hosts — was buried.

Two changes:

1. Strict URL regex: hostname must have at least one dot and end in a
   2+ letter TLD, with no whitespace / angle brackets / non-printables
   in the path. This eliminates the dictionary-fragment noise.

2. Bucket the surviving URLs into "likely first-party" vs "third-party"
   using references/third_party_hosts.txt — a curated denylist of ~80
   patterns covering Google/Firebase/Apple/Microsoft/Adobe, attribution
   and observability vendors (AppsFlyer, Datadog, Sentry, Bugsnag, ...),
   payments (Stripe, PayU, Adyen, ...), support/chat SDKs, CAs, and
   standards namespaces (w3.org, etc.).

The new output starts with a frequency-sorted list of likely first-party
hosts — which is the artifact every reverse-engineer wants on the first
page — followed by the collapsed third-party list and the full URL set
for first-party hosts only.

The denylist is a sidecar text file (one regex per line) so users can
extend or override it without editing the script.
---
 .../references/third_party_hosts.txt          | 122 ++++++++++++++++++
 .../scripts/find-api-calls.sh                 |  60 ++++++++-
 2 files changed, 180 insertions(+), 2 deletions(-)
 create mode 100644 plugins/android-reverse-engineering/skills/android-reverse-engineering/references/third_party_hosts.txt

diff --git a/plugins/android-reverse-engineering/skills/android-reverse-engineering/references/third_party_hosts.txt b/plugins/android-reverse-engineering/skills/android-reverse-engineering/references/third_party_hosts.txt
new file mode 100644
index 0000000..976636c
--- /dev/null
+++ b/plugins/android-reverse-engineering/skills/android-reverse-engineering/references/third_party_hosts.txt
@@ -0,0 +1,122 @@
+# Third-party host denylist used by find-api-calls.sh --urls.
+#
+# Patterns are extended regexes (grep -E), one per line, searched for in the
+# bare hostname. Most use the form (^|\.)domain\.tld$ so that the domain itself
+# and every subdomain match. '#' comment lines and blank lines are ignored.
+#
+# This list is intentionally conservative: when a pattern would hide a
+# legitimate first-party host (e.g. an app may run its own *.s3.amazonaws.com
+# bucket), keep the pattern but expect manual review of the bucketed output.
+
+# Google / Firebase / Play / Crashlytics
+(^|\.)googleapis\.com$
+(^|\.)google\.com$
+(^|\.)gstatic\.com$
+(^|\.)googleusercontent\.com$
+(^|\.)googletagmanager\.com$
+(^|\.)googlesyndication\.com$
+(^|\.)firebaseio\.com$
+(^|\.)firebaseapp\.com$
+(^|\.)firebaseinstallations\.googleapis\.com$
+(^|\.)firebaseremoteconfig\.googleapis\.com$
+(^|\.)crashlytics\.com$
+(^|\.)app-measurement\.com$
+
+# Apple / Microsoft / Adobe
+(^|\.)apple\.com$
+(^|\.)icloud\.com$
+(^|\.)microsoft\.com$
+(^|\.)live\.com$
+(^|\.)office\.com$
+(^|\.)adobe\.com$
+(^|\.)ns\.adobe\.com$
+
+# Meta
+(^|\.)facebook\.com$
+(^|\.)fbcdn\.net$
+(^|\.)instagram\.com$
+(^|\.)whatsapp\.com$
+
+# Other social / messaging / video
+(^|\.)twitter\.com$
+(^|\.)x\.com$
+(^|\.)tiktok\.com$
+(^|\.)youtube\.com$
+(^|\.)youtu\.be$
+(^|\.)linkedin\.com$
+(^|\.)snapchat\.com$
+(^|\.)pinterest\.com$
+(^|\.)reddit\.com$
+
+# Mobile attribution / analytics / observability
+(^|\.)appsflyersdk\.com$
+(^|\.)appsflyer\.com$
+(^|\.)adjust\.com$
+(^|\.)branch\.io$
+(^|\.)amplitude\.com$
+(^|\.)segment\.com$
+(^|\.)mixpanel\.com$
+(^|\.)hotjar\.com$
+(^|\.)clarity\.ms$
+(^|\.)datadoghq\.(com|eu|us)$
+(^|\.)sentry\.io$
+(^|\.)bugsnag\.com$
+(^|\.)newrelic\.com$
+(^|\.)instabug\.com$
+(^|\.)embrace\.io$
+(^|\.)rollout\.io$
+(^|\.)launchdarkly\.com$
+
+# Push / notifications
+(^|\.)onesignal\.com$
+(^|\.)urbanairship\.com$
+(^|\.)airship\.com$
+
+# Support / chat
+(^|\.)zendesk\.com$
+(^|\.)intercom\.io$
+(^|\.)intercomcdn\.com$
+(^|\.)helpshift\.com$
+(^|\.)salesforce\.com$
+(^|\.)freshchat\.com$
+(^|\.)kustomerapp\.com$
+
+# Payments
+(^|\.)stripe\.com$
+(^|\.)braintreepayments\.com$
+(^|\.)braintreegateway\.com$
+(^|\.)payu\.com$
+(^|\.)payu\.in$
+(^|\.)paypal\.com$
+(^|\.)adyen\.com$
+(^|\.)checkout\.com$
+(^|\.)klarna\.com$
+
+# Maps / location
+(^|\.)mapbox\.com$
+(^|\.)openstreetmap\.org$
+
+# Storage / CDN (often third-party even when the bucket name is app-specific)
+(^|\.)s3\.amazonaws\.com$
+(^|\.)cloudfront\.net$
+(^|\.)akamaihd\.net$
+(^|\.)akamaized\.net$
+(^|\.)fastly\.net$
+(^|\.)cloudflare\.com$
+(^|\.)azureedge\.net$
+
+# DNS / well-known infra
+(^|\.)localhost$
+^localhost
+^127\.
+
+# Standards / RFCs / placeholders that show up as XML/XMP namespaces
+(^|\.)w3\.org$
+(^|\.)w3c\.org$
+(^|\.)example\.(com|org|net)$
+
+# Certificate authorities
+(^|\.)sectigo\.com$
+(^|\.)entrust\.com$
+(^|\.)digicert\.com$
+(^|\.)letsencrypt\.org$
diff --git a/plugins/android-reverse-engineering/skills/android-reverse-engineering/scripts/find-api-calls.sh b/plugins/android-reverse-engineering/skills/android-reverse-engineering/scripts/find-api-calls.sh
index 092d64f..d557d15 100755
--- a/plugins/android-reverse-engineering/skills/android-reverse-engineering/scripts/find-api-calls.sh
+++ b/plugins/android-reverse-engineering/skills/android-reverse-engineering/scripts/find-api-calls.sh
@@ -158,9 +158,65 @@ if [[ "$SEARCH_ALL" == true || "$SEARCH_PATHS" == true ]]; then
 fi
 
 # --- Hardcoded URLs ---
+# A loose grep for http(s)://... drowns in compression-dictionary garbage and
+# in third-party SDK URLs (Google, Firebase, AppsFlyer, Datadog, ...). The
+# strict regex requires a dotted hostname ending in a 2+ letter TLD and ends
+# the match at the first quote, whitespace, or angle bracket. Hosts are
+# then bucketed into "first-party candidates" vs "third-party (denylist)".
 if [[ "$SEARCH_ALL" == true || "$SEARCH_URLS" == true ]]; then
-  section "Hardcoded URLs (http:// and https://)"
-  run_grep '"https?://[^"]+'
+  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+  DENYLIST="$HERE/../references/third_party_hosts.txt"
+  # Hostname: one or more labels, a dot, then a 2+ letter TLD, so apex
+  # domains (https://example.com) are kept as well as subdomains.
+  STRICT_URL='https?://[A-Za-z0-9-]+(\.[A-Za-z0-9-]+)*\.[A-Za-z]{2,}(:[0-9]{1,5})?(/[^"<>[:space:]]*)?'
+
+  # One scratch directory holds every temp file, so a single trap cleans up.
+  URL_WORK="$(mktemp -d)"
+  trap 'rm -rf -- "$URL_WORK"' EXIT
+
+  # One "host<TAB>url" line per occurrence (duplicates kept so the counts are
+  # real frequencies). grep exits 1 when nothing matches; not an error here.
+  { grep -rhoE --include='*.java' --include='*.kt' "$STRICT_URL" "$SOURCE_DIR" 2>/dev/null || true; } \
+    | awk '{ h = $0; sub("^https?://", "", h); sub("[/:].*$", "", h); print tolower(h) "\t" $0 }' \
+    > "$URL_WORK/pairs"
+  cut -f1 "$URL_WORK/pairs" | sort -u > "$URL_WORK/hosts"
+
+  # Strip comments/blank lines (a missing denylist just leaves this empty).
+  # grep -f takes one ERE per line, so an empty list cannot match everything.
+  grep -vE '^[[:space:]]*(#|$)' "$DENYLIST" > "$URL_WORK/deny" 2>/dev/null || true
+  if [[ -s "$URL_WORK/deny" ]]; then
+    grep -E -f "$URL_WORK/deny" "$URL_WORK/hosts" > "$URL_WORK/third" || true
+    grep -vE -f "$URL_WORK/deny" "$URL_WORK/hosts" > "$URL_WORK/first" || true
+  else
+    : > "$URL_WORK/third"
+    cp "$URL_WORK/hosts" "$URL_WORK/first"
+  fi
+
+  section "Likely First-Party Hosts (frequency-sorted)"
+  if [[ -s "$URL_WORK/first" ]]; then
+    awk -F'\t' 'NR == FNR { keep[$0]; next } $1 in keep { n[$1]++ }
+      END { for (h in n) printf "  %5d  %s\n", n[h], h }' \
+      "$URL_WORK/first" "$URL_WORK/pairs" | sort -k1,1nr -k2
+  else
+    echo "  (none — every URL matched the third-party denylist)"
+  fi
+
+  section "Third-Party Hosts (denylist matches, collapsed)"
+  if [[ -s "$URL_WORK/third" ]]; then
+    sed 's/^/  /' "$URL_WORK/third"
+  else
+    echo "  (none)"
+  fi
+
+  section "All First-Party URLs (full strings)"
+  if [[ -s "$URL_WORK/first" ]]; then
+    sort -u "$URL_WORK/pairs" \
+      | awk -F'\t' 'NR == FNR { keep[$0]; next } $1 in keep { print "  " $2 }' "$URL_WORK/first" -
+  fi
+
+  rm -rf -- "$URL_WORK"
+  trap - EXIT
+
   section "HttpURLConnection"
   run_grep '(openConnection|setRequestMethod|HttpURLConnection|HttpsURLConnection)'
   section "WebView URLs"