MicroFish/backend/scripts/test_profile_format.py

"""Profile-format generation tests for OASIS compatibility.

Verifies that:
1. Twitter profiles serialize to CSV format.
2. Reddit profiles serialize to detailed JSON format.
"""

import os
import sys
import json
import csv
import tempfile

# Add the project root to sys.path so the ``app`` package resolves.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.services.oasis_profile_generator import OasisProfileGenerator, OasisAgentProfile


def test_profile_formats():
    """Exercise both profile-format outputs end-to-end."""
    print("=" * 60)
    print("OASIS Profile格式测试")
    print("=" * 60)

    # Build a small set of test profiles.
    test_profiles = [
        OasisAgentProfile(
            user_id=0,
            user_name="test_user_123",
            name="Test User",
            bio="A test user for validation",
            persona="Test User is an enthusiastic participant in social discussions.",
            karma=1500,
            friend_count=100,
            follower_count=200,
            statuses_count=500,
            age=25,
            gender="male",
            mbti="INTJ",
            country="China",
            profession="Student",
            interested_topics=["Technology", "Education"],
            source_entity_uuid="test-uuid-123",
            source_entity_type="Student",
        ),
        OasisAgentProfile(
            user_id=1,
            user_name="org_official_456",
            name="Official Organization",
            bio="Official account for Organization",
            persona="This is an official institutional account that communicates official positions.",
            karma=5000,
            friend_count=50,
            follower_count=10000,
            statuses_count=200,
            profession="Organization",
            interested_topics=["Public Policy", "Announcements"],
            source_entity_uuid="test-uuid-456",
            source_entity_type="University",
        ),
    ]

    generator = OasisProfileGenerator.__new__(OasisProfileGenerator)

    # Use a temp directory for the test fixtures.
    with tempfile.TemporaryDirectory() as temp_dir:
        twitter_path = os.path.join(temp_dir, "twitter_profiles.csv")
        reddit_path = os.path.join(temp_dir, "reddit_profiles.json")

        # Twitter CSV format.
        print("\n1. 测试Twitter Profile (CSV格式)")
        print("-" * 40)
        generator._save_twitter_csv(test_profiles, twitter_path)

        # Read back and verify the CSV.
        with open(twitter_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            rows = list(reader)

        print(f"   文件: {twitter_path}")
        print(f"   行数: {len(rows)}")
        print(f"   表头: {list(rows[0].keys())}")
        print(f"\n   示例数据 (第1行):")
        for key, value in rows[0].items():
            print(f"     {key}: {value}")

        # Verify the required fields are present.
        required_twitter_fields = ['user_id', 'user_name', 'name', 'bio',
                                   'friend_count', 'follower_count', 'statuses_count', 'created_at']
        missing = set(required_twitter_fields) - set(rows[0].keys())
        if missing:
            print(f"\n   [错误] 缺少字段: {missing}")
        else:
            print(f"\n   [通过] 所有必需字段都存在")

        # Reddit JSON format.
        print("\n2. 测试Reddit Profile (JSON详细格式)")
        print("-" * 40)
        generator._save_reddit_json(test_profiles, reddit_path)

        # Read back and verify the JSON.
        with open(reddit_path, 'r', encoding='utf-8') as f:
            reddit_data = json.load(f)

        print(f"   文件: {reddit_path}")
        print(f"   条目数: {len(reddit_data)}")
        print(f"   字段: {list(reddit_data[0].keys())}")
        print(f"\n   示例数据 (第1条):")
        print(json.dumps(reddit_data[0], ensure_ascii=False, indent=4))

        # Verify the detailed Reddit format fields.
        required_reddit_fields = ['realname', 'username', 'bio', 'persona']
        optional_reddit_fields = ['age', 'gender', 'mbti', 'country', 'profession', 'interested_topics']

        missing = set(required_reddit_fields) - set(reddit_data[0].keys())
        if missing:
            print(f"\n   [错误] 缺少必需字段: {missing}")
        else:
            print(f"\n   [通过] 所有必需字段都存在")

        present_optional = set(optional_reddit_fields) & set(reddit_data[0].keys())
        print(f"   [信息] 可选字段: {present_optional}")

    print("\n" + "=" * 60)
    print("测试完成!")
    print("=" * 60)


def show_expected_formats():
    """Print the canonical OASIS-expected profile formats for reference."""
    print("\n" + "=" * 60)
    print("OASIS 期望的Profile格式参考")
    print("=" * 60)

    print("\n1. Twitter Profile (CSV格式)")
    print("-" * 40)
    twitter_example = """user_id,user_name,name,bio,friend_count,follower_count,statuses_count,created_at
0,user0,User Zero,I am user zero with interests in technology.,100,150,500,2023-01-01
1,user1,User One,Tech enthusiast and coffee lover.,200,250,1000,2023-01-02"""
    print(twitter_example)

    print("\n2. Reddit Profile (JSON详细格式)")
    print("-" * 40)
    reddit_example = [
        {
            "realname": "James Miller",
            "username": "millerhospitality",
            "bio": "Passionate about hospitality & tourism.",
            "persona": "James is a seasoned professional in the Hospitality & Tourism industry...",
            "age": 40,
            "gender": "male",
            "mbti": "ESTJ",
            "country": "UK",
            "profession": "Hospitality & Tourism",
            "interested_topics": ["Economics", "Business"]
        }
    ]
    print(json.dumps(reddit_example, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    test_profile_formats()
    show_expected_formats()