android-reverse-engineering.../plugins/android-reverse-engineering/skills/sdk-neutralizer/scripts/registry-scan.py

518 lines
17 KiB
Python
Executable File

#!/usr/bin/env python3
"""registry-scan.py — Scan decoded APK against SDK registry for neutralization targets.
Loads SDK registry JSONs, scans smali directories for matching packages,
and generates targets-file + manifest-components-file for neutralize.sh.
Also discovers unknown SDK packages not covered by the registry.
Requires Python 3.6+. No external dependencies.
Exit codes:
0 — success (matches found)
1 — error (invalid input, missing files)
2 — no matches found
"""
import argparse
import json
import os
import re
import sys
from collections import defaultdict
from fnmatch import fnmatch
from pathlib import Path
# =====================================================================
# Constants
# =====================================================================
# Registry "category" values selected by each --category filter choice.
# --category ads
ADS_CATEGORIES = {
    "ads",
    "ads_mediation",
}
# --category trackers
TRACKER_CATEGORIES = {
    "analytics",
    "attribution",
    "crash_reporting",
    "social",
}
# Slash-separated package prefixes of known non-SDK libraries.
# A package equal to, or nested under, any entry is excluded from
# unknown-SDK discovery (see is_known_library()).
KNOWN_LIBRARY_PACKAGES = {
    # Top-level namespaces
    "android",
    "androidx",
    "kotlin",
    "kotlinx",
    "okhttp3",
    "okio",
    "retrofit2",
    "retrofit",
    "dagger",
    "bolts",
    # com/google/*
    "com/google/protobuf",
    "com/google/gson",
    "com/google/common",
    "com/google/crypto",
    "com/google/flatbuffers",
    "com/google/dagger",
    # com/google/android/*
    "com/google/android/material",
    "com/google/android/play",
    "com/google/android/exoplayer",
    "com/google/android/exoplayer2",
    "com/google/android/datatransport",
    "com/google/android/recaptcha",
    "com/google/android/ump",
    # com/google/android/gms/*
    "com/google/android/gms/base",
    "com/google/android/gms/common",
    "com/google/android/gms/tasks",
    "com/google/android/gms/flags",
    "com/google/android/gms/dynamic",
    "com/google/android/gms/dynamite",
    "com/google/android/gms/security",
    "com/google/android/gms/cloudmessaging",
    "com/google/android/gms/phenotype",
    # com/google/firebase/*
    "com/google/firebase/components",
    "com/google/firebase/inject",
    "com/google/firebase/encoders",
    "com/google/firebase/installations",
    "com/google/firebase/sessions",
    "com/google/firebase/messaging",
    "com/google/firebase/datatransport",
    # com/squareup/*
    "com/squareup/okhttp3",
    "com/squareup/okhttp",
    "com/squareup/moshi",
    "com/squareup/wire",
    "com/squareup/picasso",
    # Other com/* prefixes
    "com/jakewharton",
    "com/bumptech/glide",
    "com/airbnb/lottie",
    "com/github",
    # javax/*, io/*, org/*
    "javax/inject",
    "javax/annotation",
    "io/reactivex",
    "io/reactivex/rxjava3",
    "org/json",
    "org/intellij",
    "org/jetbrains",
    "org/apache",
}
# Minimum class count to consider a package as a potential SDK.
# find_unknown_packages() skips aggregated package roots with fewer classes.
MIN_CLASSES_FOR_UNKNOWN = 10
# Minimum package depth (slash-separated segments) to consider as SDK
# (not obfuscated). find_unknown_packages() skips shallower roots.
MIN_PACKAGE_DEPTH = 3
# =====================================================================
# Registry loading
# =====================================================================
def load_registry(registry_dir):
    """Load all SDK JSON files from the registry directory.

    Files matching ``*.json`` are read in sorted order; files whose name
    starts with ``_`` are ignored. A file that cannot be read, is not valid
    UTF-8, is not valid JSON, or whose top-level value is not a JSON object
    is reported on stderr and skipped so that one bad file cannot abort the
    whole scan.

    Args:
        registry_dir: Path to the directory holding the SDK JSON files.

    Returns:
        A list of dicts, one per successfully loaded registry file.

    Exits:
        Calls ``sys.exit(1)`` when ``registry_dir`` is not a directory.
    """
    registry_path = Path(registry_dir)
    if not registry_path.is_dir():
        print(f"Error: Registry directory not found: {registry_dir}", file=sys.stderr)
        sys.exit(1)

    sdks = []
    for json_file in sorted(registry_path.glob("*.json")):
        if json_file.name.startswith("_"):
            continue
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                sdk = json.load(f)
        except (ValueError, OSError) as e:
            # ValueError covers both json.JSONDecodeError and the
            # UnicodeDecodeError raised for files that are not UTF-8.
            print(f"Warning: Skipping {json_file.name}: {e}", file=sys.stderr)
            continue
        if not isinstance(sdk, dict):
            # Every consumer calls sdk.get(...); anything but an object
            # would crash later, so reject it here with a clear message.
            print(
                f"Warning: Skipping {json_file.name}: top-level JSON value is not an object",
                file=sys.stderr,
            )
            continue
        sdks.append(sdk)
    return sdks
def filter_sdks_by_category(sdks, category):
    """Restrict the SDK list to one category family.

    ``"ads"`` keeps entries whose ``category`` is in ADS_CATEGORIES and
    ``"trackers"`` keeps those in TRACKER_CATEGORIES. ``"all"`` and any
    unrecognised value return the input list unchanged.
    """
    if category == "ads":
        wanted = ADS_CATEGORIES
    elif category == "trackers":
        wanted = TRACKER_CATEGORIES
    else:
        # "all" and unrecognised values: no filtering, same list object.
        return sdks
    return [entry for entry in sdks if entry.get("category") in wanted]
# =====================================================================
# Smali scanning
# =====================================================================
def find_smali_dirs(decoded_dir):
    """Return every ``smali*`` directory directly under decoded_dir, sorted.

    Matching on the ``smali*`` prefix picks up the extra directories that
    multidex APKs decode into. Non-directory entries are ignored.
    """
    candidates = Path(decoded_dir).glob("smali*")
    return sorted(entry for entry in candidates if entry.is_dir())
def get_app_package(decoded_dir):
    """Extract the app package from AndroidManifest.xml as a slash path.

    Args:
        decoded_dir: Directory expected to contain ``AndroidManifest.xml``.

    Returns:
        The value of the ``package`` attribute of the ``<manifest>`` tag with
        dots replaced by slashes (``com.example.app`` -> ``com/example/app``),
        or ``None`` when the manifest is missing, unreadable, not valid
        UTF-8, or has no ``package`` attribute.
    """
    manifest = Path(decoded_dir) / "AndroidManifest.xml"
    try:
        content = manifest.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # Missing/unreadable file or undecodable bytes: the package is
        # optional for the caller, so report "unknown" instead of crashing.
        return None
    # Require whitespace before "package=" so namespaced attributes that
    # merely end in "package" (e.g. foo:package="...") are not picked up.
    match = re.search(r'<manifest\b[^>]*\spackage="([^"]+)"', content)
    if match is None:
        return None
    return match.group(1).replace(".", "/")
def scan_packages(smali_dirs):
    """Count ``.smali`` files per package across all smali directories.

    The package of a file is its directory path relative to the smali
    root, joined with ``/``. Files sitting directly in a smali root have
    no package and are not counted.

    Returns:
        A ``defaultdict(int)`` mapping package path -> number of classes.
    """
    counts = defaultdict(int)
    for root in smali_dirs:
        for class_file in root.rglob("*.smali"):
            package_parts = class_file.relative_to(root).parent.parts
            if not package_parts:
                continue
            counts["/".join(package_parts)] += 1
    return counts
def check_package_exists(smali_dirs, package_dot):
    """Tell whether a dotted package has a directory in any smali root.

    ``com.example.sdk`` is looked up as ``<smali_dir>/com/example/sdk``.
    """
    relative = package_dot.replace(".", "/")
    return any((root / relative).is_dir() for root in smali_dirs)
def find_sdk_matches(sdks, smali_dirs):
    """Pair each registry SDK with the packages of it found in the APK.

    Returns:
        A list of ``(sdk, matched_packages)`` tuples, in registry order,
        containing only SDKs for which at least one of their ``packages``
        exists in a smali directory.
    """
    results = []
    for sdk in sdks:
        present = [
            pkg
            for pkg in sdk.get("packages", [])
            if check_package_exists(smali_dirs, pkg)
        ]
        if present:
            results.append((sdk, present))
    return results
# =====================================================================
# Target generation
# =====================================================================
def java_to_smali_class(java_class):
    """Turn a dotted Java class name into a smali path.

    Example: ``com.example.Foo`` -> ``com/example/Foo``.
    """
    segments = java_class.split(".")
    return "/".join(segments)
def generate_targets(sdk, depth):
    """Build the targets-file lines for one matched SDK.

    Depth 1 emits ``entry_points`` only, depth 2 adds ``ad_operations`` and
    depth 3 adds ``deep_patterns``. Every target is written as a comment
    line naming the SDK followed by the target line itself. Methods whose
    name appears in one of the SDK's ``protected_patterns`` are left out.

    Args:
        sdk: Registry entry (dict).
        depth: Neutralization depth, 1-3.

    Returns:
        List of text lines (without trailing newlines).
    """
    target_groups = sdk.get("targets", {})
    label = sdk.get("display_name", sdk.get("sdk_id", "Unknown"))

    # Method names pulled out of patterns shaped like "*.getActivity()*".
    shielded = set()
    for entry in sdk.get("protected_patterns", []):
        found = re.search(r'\*\.(\w+)\(', entry.get("pattern", ""))
        if found:
            shielded.add(found.group(1))

    def method_lines(group_key, tag):
        # One "comment + Class:method" pair per unprotected method.
        out = []
        for class_target in target_groups.get(group_key, []):
            smali_cls = java_to_smali_class(class_target["class"])
            for method in class_target.get("methods", []):
                name = method["name"]
                if name not in shielded:
                    out.append(f"# [{label}] {tag}")
                    out.append(f"{smali_cls}:{name}")
        return out

    # Level 1 is always emitted.
    result = method_lines("entry_points", "entry_point")

    # Level 2
    if depth >= 2:
        result += method_lines("ad_operations", "ad_operation")

    # Level 3: package wildcards, "com.example.pkg.**" -> "com/example/pkg/**:*"
    if depth >= 3:
        for deep in target_groups.get("deep_patterns", []):
            pkg_path = deep["package_glob"].replace(".", "/").rstrip("*").rstrip("/")
            result.append(f"# [{label}] deep_pattern: {deep.get('rule', 'stub_all_void')}")
            result.append(f"{pkg_path}/**:*")

    return result
def generate_manifest_components(sdk):
    """Build the manifest-components-file lines for one matched SDK.

    Each line has the form ``<component class>|<SDK name>``, where the SDK
    name is ``display_name``, falling back to ``sdk_id`` and then
    ``"Unknown"``.
    """
    label = sdk.get("display_name", sdk.get("sdk_id", "Unknown"))
    return [
        f"{component['class']}|{label}"
        for component in sdk.get("manifest_components", [])
    ]
# =====================================================================
# Unknown package discovery
# =====================================================================
def is_obfuscated_package(package):
    """Heuristically decide whether a slash-separated package is obfuscated.

    A package counts as obfuscated when either:
      * every segment is at most two characters long (``a/b``, ``ab/cd``), or
      * it has at least two segments and the first two are each at most
        one character long (``a/b/anything``).
    """
    segments = package.split("/")

    # split() always yields at least one segment, so max() is safe here.
    if max(len(segment) for segment in segments) <= 2:
        return True

    leading = segments[:2]
    return len(leading) == 2 and all(len(segment) <= 1 for segment in leading)
def is_known_library(package):
    """Tell whether a slash-separated package belongs to a known library.

    A package matches when it equals an entry of KNOWN_LIBRARY_PACKAGES or
    is nested below one (``okhttp3/internal`` matches ``okhttp3``).
    """
    return any(
        package == known or package.startswith(known + "/")
        for known in KNOWN_LIBRARY_PACKAGES
    )
def find_unknown_packages(package_classes, matched_sdk_packages, app_package):
    """Find packages that might be SDKs missing from the registry.

    Packages are first filtered individually, then the survivors are
    grouped by their first three segments and the groups are thresholded.
    Filtering has to happen *before* grouping: once a package has been
    truncated to three segments it can no longer be recognised as living
    under a deeper exclusion such as ``com/google/android/gms/common``, a
    matched SDK root like ``com/google/android/gms/ads``, or an app package
    with four or more segments, and its classes would be misreported as
    an unknown SDK.

    Per-package filters (applied to the full package path):
      - known libraries (androidx, kotlin, okhttp, etc.)
      - the app's own package and everything below it
      - packages of SDKs that were matched against the registry

    Per-group filters (applied to the 3-segment root):
      - minimum class count (MIN_CLASSES_FOR_UNKNOWN)
      - minimum package depth (MIN_PACKAGE_DEPTH)
      - obfuscated-looking names

    Args:
        package_classes: Mapping of slash-separated package -> class count.
        matched_sdk_packages: Iterable of dot-separated SDK packages to
            exclude.
        app_package: Slash-separated app package, or ``None`` if unknown.

    Returns:
        List of ``(package_root, class_count)`` tuples sorted by descending
        class count.
    """
    # Matched SDK roots as slash paths, comparable with package_classes keys.
    sdk_roots = {pkg_dot.replace(".", "/") for pkg_dot in matched_sdk_packages}

    def is_under(package, root):
        # True when package is root itself or nested below it.
        return package == root or package.startswith(root + "/")

    # Aggregate class counts at the 3-segment level for better grouping,
    # counting only packages that survive the exclusion filters.
    aggregated = defaultdict(int)
    for pkg, count in package_classes.items():
        if is_known_library(pkg):
            continue
        if app_package and is_under(pkg, app_package):
            continue
        if any(is_under(pkg, sdk_root) for sdk_root in sdk_roots):
            continue
        # Packages shallower than three segments keep their own name and
        # are dropped by the depth check below.
        root = "/".join(pkg.split("/")[:3])
        aggregated[root] += count

    unknowns = []
    for package, class_count in sorted(aggregated.items(), key=lambda item: -item[1]):
        if class_count < MIN_CLASSES_FOR_UNKNOWN:
            continue
        if len(package.split("/")) < MIN_PACKAGE_DEPTH:
            continue
        if is_obfuscated_package(package):
            continue
        unknowns.append((package, class_count))
    return unknowns
# =====================================================================
# Report generation
# =====================================================================
def generate_report(matches, unknowns, depth, category):
    """Generate the JSON-serialisable report of scan results.

    Registry metadata is read with the same tolerant lookups used when
    generating targets, so an entry that lacks ``sdk_id``, ``display_name``
    or ``category`` is still reported instead of raising ``KeyError``:
    ``display_name`` falls back to ``sdk_id`` and then ``"Unknown"``;
    a missing ``sdk_id`` or ``category`` is reported as ``None``.

    Args:
        matches: List of ``(sdk, matched_packages)`` tuples.
        unknowns: List of ``(package, class_count)`` tuples.
        depth: Neutralization depth (1-3) the scan ran with.
        category: Category filter the scan ran with.

    Returns:
        Dict with ``scan_config``, ``matched_sdks`` and ``unknown_packages``.
    """
    report = {
        "scan_config": {
            "depth": depth,
            "category": category,
        },
        "matched_sdks": [],
        "unknown_packages": [],
    }
    for sdk, matched_pkgs in matches:
        targets = sdk.get("targets", {})
        n_entry = sum(len(ct.get("methods", [])) for ct in targets.get("entry_points", []))
        n_ops = sum(len(ct.get("methods", [])) for ct in targets.get("ad_operations", []))
        n_deep = len(targets.get("deep_patterns", []))
        n_manifest = len(sdk.get("manifest_components", []))

        # Only the levels enabled by the current depth count as targets.
        n_targets = n_entry
        if depth >= 2:
            n_targets += n_ops
        if depth >= 3:
            n_targets += n_deep

        report["matched_sdks"].append({
            "sdk_id": sdk.get("sdk_id"),
            "display_name": sdk.get("display_name", sdk.get("sdk_id", "Unknown")),
            "category": sdk.get("category"),
            "matched_packages": matched_pkgs,
            "targets_count": n_targets,
            "manifest_components_count": n_manifest,
            "depth_breakdown": {
                "entry_points": n_entry,
                "ad_operations": n_ops,
                "deep_patterns": n_deep,
            },
        })
    for package, class_count in unknowns:
        report["unknown_packages"].append({
            "package": package,
            "class_count": class_count,
        })
    return report
# =====================================================================
# Main
# =====================================================================
def main():
    """Command-line entry point.

    Parses arguments, scans the decoded APK against the registry and writes
    three files into the output directory (the decoded directory unless
    --output-dir is given): ``registry-targets.txt``,
    ``registry-manifest.txt`` and ``registry-report.json``. A
    machine-readable summary (MATCHED / UNKNOWN_PACKAGE / REGISTRY_TARGETS /
    REGISTRY_MANIFEST lines) is printed to stdout.

    Exit status: 0 when at least one SDK matched, 1 on invalid input,
    2 when nothing matched (output files are still written).
    """
    cli = argparse.ArgumentParser(
        description="Scan decoded APK against SDK registry for neutralization targets."
    )
    cli.add_argument("decoded_dir", help="Path to apktool-decoded APK directory")
    cli.add_argument(
        "--registry",
        required=True,
        help="Path to SDK registry directory containing JSON files",
    )
    cli.add_argument(
        "--depth",
        type=int,
        choices=[1, 2, 3],
        default=1,
        help="Neutralization depth: 1=entry_points, 2=+ad_operations, 3=+deep_patterns (default: 1)",
    )
    cli.add_argument(
        "--category",
        choices=["ads", "trackers", "all"],
        default="all",
        help="Filter SDKs by category (default: all)",
    )
    cli.add_argument(
        "--output-dir",
        help="Output directory for generated files (default: decoded-dir)",
    )
    opts = cli.parse_args()

    decoded_dir = opts.decoded_dir
    output_dir = opts.output_dir or decoded_dir

    # --- Input validation -------------------------------------------------
    if not os.path.isdir(decoded_dir):
        print(f"Error: Directory not found: {decoded_dir}", file=sys.stderr)
        sys.exit(1)

    smali_dirs = find_smali_dirs(decoded_dir)
    if not smali_dirs:
        print(f"Error: No smali/ directory found in {decoded_dir}", file=sys.stderr)
        sys.exit(1)

    sdks = load_registry(opts.registry)
    if not sdks:
        print("Error: No SDK entries loaded from registry", file=sys.stderr)
        sys.exit(1)
    sdks = filter_sdks_by_category(sdks, opts.category)

    # --- Scan -------------------------------------------------------------
    package_classes = scan_packages(smali_dirs)
    app_package = get_app_package(decoded_dir)
    matches = find_sdk_matches(sdks, smali_dirs)

    # Every package declared by a matched SDK is excluded from unknown
    # discovery, not only the packages that were actually found.
    all_matched_packages = {
        pkg for sdk, _ in matches for pkg in sdk.get("packages", [])
    }

    # --- Targets file content --------------------------------------------
    targets_lines = [
        "# Auto-generated by registry-scan.py",
        f"# Depth: {opts.depth} | Category: {opts.category}",
        "",
    ]
    for sdk, _ in matches:
        sdk_targets = generate_targets(sdk, opts.depth)
        if sdk_targets:
            targets_lines += sdk_targets
            targets_lines.append("")  # blank separator after each SDK section

    # --- Manifest components file content --------------------------------
    manifest_lines = []
    for sdk, _ in matches:
        manifest_lines += generate_manifest_components(sdk)

    unknowns = find_unknown_packages(package_classes, all_matched_packages, app_package)
    report = generate_report(matches, unknowns, opts.depth, opts.category)

    # --- Write output files ----------------------------------------------
    os.makedirs(output_dir, exist_ok=True)

    targets_path = os.path.join(output_dir, "registry-targets.txt")
    with open(targets_path, "w", encoding="utf-8") as out:
        out.write("\n".join(targets_lines) + "\n")

    manifest_path = os.path.join(output_dir, "registry-manifest.txt")
    with open(manifest_path, "w", encoding="utf-8") as out:
        out.write("\n".join(manifest_lines) + "\n")

    report_path = os.path.join(output_dir, "registry-report.json")
    with open(report_path, "w", encoding="utf-8") as out:
        json.dump(report, out, indent=2, ensure_ascii=False)
        out.write("\n")

    # --- Machine-readable stdout -----------------------------------------
    def count_methods(class_targets):
        # Total number of methods listed across a group of class targets.
        return sum(len(ct.get("methods", [])) for ct in class_targets)

    for sdk, _ in matches:
        targets = sdk.get("targets", {})
        n_entry = count_methods(targets.get("entry_points", []))
        n_ops = count_methods(targets.get("ad_operations", []))
        n_deep = len(targets.get("deep_patterns", []))
        n_targets = n_entry
        if opts.depth >= 2:
            n_targets += n_ops
        if opts.depth >= 3:
            n_targets += n_deep
        print(f"MATCHED:{sdk['sdk_id']}:{sdk['display_name']}:{sdk['category']}:{n_targets}")

    for package, class_count in unknowns:
        print(f"UNKNOWN_PACKAGE:{package}:{class_count}")

    print(f"REGISTRY_TARGETS:{targets_path}")
    print(f"REGISTRY_MANIFEST:{manifest_path}")

    # Exit status 2 signals "ran fine, nothing matched".
    sys.exit(0 if matches else 2)


if __name__ == "__main__":
    main()