import json
import csv
import sys
import argparse
import os

# --- Pre-computation and Validation ---

# Columns the locations CSV is expected to provide. Missing columns only
# trigger a warning (not an abort) in process_locations_stream().
REQUIRED_CSV_COLUMNS = {'City', 'State', 'Zip Code', 'Street Address', 'Latitude', 'Longitude'}

# Keys a well-formed JobPosting schema should carry. Absent keys are
# reported as warnings in load_and_prepare_schemas(); the schema is still used.
RECOMMENDED_SCHEMA_KEYS = {'@type', 'title', 'description', 'hiringOrganization'}

# --- Low-Level Readers (Memory Efficient) ---

# Raw control characters mapped to their short JSON escape sequences.
# Anything else below U+0020 falls back to a \uXXXX escape.
_CONTROL_ESCAPES = {'\n': '\\n', '\r': '\\r', '\t': '\\t', '\b': '\\b', '\f': '\\f'}

def sanitize_json_string(buffer: str) -> str:
    """
    Escape raw control characters that appear inside JSON string literals
    so the buffer becomes parseable by json.loads().

    Per RFC 8259 a JSON string may not contain unescaped characters below
    U+0020, so ALL such characters are escaped (previously only \\n, \\r
    and \\t were handled; others made the block unparseable). Text outside
    string literals is passed through unchanged. Escaped quotes (\\")
    do not toggle string state: a single forward pass tracks the parity
    of the preceding backslash run, which is equivalent to (and faster
    than) scanning backwards at every quote.

    :param buffer: raw text of one JSON object/array, possibly containing
                   illegal control characters inside its strings.
    :return: the sanitized text, safe to hand to json.loads().
    """
    out = []
    in_string = False      # inside a JSON string literal
    prev_escaped = False   # current char preceded by an odd run of backslashes
    for ch in buffer:
        if ch == '"' and not prev_escaped:
            # Unescaped quote: enter or leave a string literal.
            in_string = not in_string
            out.append(ch)
        elif in_string and ch != '"' and ord(ch) < 0x20:
            out.append(_CONTROL_ESCAPES.get(ch, f'\\u{ord(ch):04x}'))
        else:
            out.append(ch)
        # Parity of consecutive backslashes decides whether the NEXT
        # character is escaped.
        prev_escaped = (ch == '\\') and not prev_escaped
    return "".join(out)

def multi_json_block_reader(file_path):
    """
    Yield one complete top-level JSON object or array at a time from a file.

    Scans the file tracking brace/bracket nesting depth. String literals
    are tracked explicitly so that '{', '}', '[' and ']' occurring inside
    quoted strings (e.g. {"a": "}"}) no longer corrupt the depth count
    and split blocks prematurely. Each completed block is run through
    sanitize_json_string() before parsing; malformed blocks are reported
    and skipped instead of aborting the whole run.

    Exits the process if the file does not exist.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            chars = []          # per-block char buffer; list+join avoids O(n^2) concat
            depth = 0           # current brace/bracket nesting depth
            in_object = False   # currently accumulating a top-level block
            in_string = False   # currently inside a JSON string literal
            escaped = False     # previous char was an active backslash
            line_number = 1     # for error reporting only
            # Read in chunks instead of one character at a time for speed.
            for chunk in iter(lambda: f.read(65536), ''):
                for char in chunk:
                    if char == '\n':
                        line_number += 1
                    if not in_object:
                        if char in '{[':
                            in_object = True
                            depth = 1
                            chars.append(char)
                        continue
                    chars.append(char)
                    if in_string:
                        # Structural characters inside strings are data.
                        if escaped:
                            escaped = False
                        elif char == '\\':
                            escaped = True
                        elif char == '"':
                            in_string = False
                    elif char == '"':
                        in_string = True
                    elif char in '{[':
                        depth += 1
                    elif char in '}]':
                        depth -= 1
                        if depth == 0:
                            buffer = "".join(chars)
                            chars = []
                            in_object = False
                            if buffer.strip():
                                try:
                                    yield json.loads(sanitize_json_string(buffer))
                                except json.JSONDecodeError as e:
                                    print(f"⚠️ JSON Decode Error near line {line_number}: {e}")
                                    print("   Skipping this malformed JSON block.\n---")
            # Flush a trailing (possibly truncated) block at EOF.
            buffer = "".join(chars)
            if buffer.strip():
                try:
                    yield json.loads(sanitize_json_string(buffer))
                except json.JSONDecodeError as e:
                    print(f"⚠️ JSON Decode Error at end of file: {e}")
                    print("   Skipping this malformed JSON block.\n---")
    except FileNotFoundError:
        print(f"❌ Error: The schemas file '{file_path}' was not found. Aborting.")
        sys.exit(1)

# --- Main Logic ---

def load_and_prepare_schemas(file_path):
    """
    Read every JSON block from file_path and return the JobPosting schemas.

    Non-dict blocks and blocks whose @type is not "JobPosting" are ignored.
    Each kept schema is warned about missing recommended keys and stripped
    of any pre-existing 'geo'/'jobLocation' data (locations are injected
    later, one per CSV row). Exits the process if the file is missing.
    """
    print(f"📄 Loading and preparing schemas from '{file_path}'...")

    if not os.path.exists(file_path):
        print(f"❌ Error: Schema file '{file_path}' does not exist.")
        sys.exit(1)

    prepared_schemas = []
    for index, schema in enumerate(multi_json_block_reader(file_path), start=1):
        # Guard clause: skip anything that is not a JobPosting dict.
        if not isinstance(schema, dict) or schema.get("@type") != "JobPosting":
            continue

        absent = RECOMMENDED_SCHEMA_KEYS - set(schema.keys())
        if absent:
            print(f"⚠️ Warning: Schema #{index} is missing recommended keys: {', '.join(absent)}")

        # Drop stale location data; fresh values are attached per CSV row.
        schema.pop('geo', None)
        schema.pop('jobLocation', None)
        prepared_schemas.append(schema)

    if prepared_schemas:
        print(f"👍 Found {len(prepared_schemas)} JobPosting schemas to process.")
    else:
        print("❌ No valid 'JobPosting' schemas found. Output file will be empty.")

    return prepared_schemas

def _write_base_schemas(schemas, output_path):
    """Write each schema as one NDJSON line to output_path; return the count written."""
    count = 0
    with open(output_path, 'w', encoding='utf-8') as fout:
        for schema in schemas:
            fout.write(json.dumps(schema) + '\n')
            count += 1
    return count

def process_locations_stream(locations_path, schemas, output_path):
    """
    Cross-multiply every base schema with every row of the locations CSV,
    streaming the results to output_path as NDJSON.

    Each (row, schema) pair gets a shallow copy of the schema with a
    'jobLocation' built from the row's address columns, plus an optional
    'geo' block when both Latitude and Longitude are non-empty.

    Falls back to writing the bare schemas when the locations file is
    missing or empty (previously duplicated code, now a shared helper).

    :return: number of schemas written, or 0 on an I/O/CSV error.
    """
    # Missing locations file: warn, but still emit the base schemas.
    if not os.path.exists(locations_path):
        print(f"✅ Note: Location file '{locations_path}' not found.")
        print(f"   Writing {len(schemas)} base schemas without location data...")
        return _write_base_schemas(schemas, output_path)

    try:
        with open(locations_path, newline='', encoding='utf-8') as csvfile:
            # Peek at the first line to detect a completely empty file.
            if not csvfile.readline():
                print(f"✅ Note: Location file '{locations_path}' is empty.")
                return _write_base_schemas(schemas, output_path)
            csvfile.seek(0)

            reader = csv.DictReader(csvfile)
            header = reader.fieldnames or []
            missing_cols = REQUIRED_CSV_COLUMNS - set(header)
            if missing_cols:
                print(f"⚠️ Warning: '{locations_path}' is missing expected columns: {', '.join(missing_cols)}")

            schemas_written = 0
            with open(output_path, 'w', encoding='utf-8') as fout:
                print(f"⚙️ Processing locations from '{locations_path}' -> '{output_path}'...")
                for row in reader:
                    for base_schema in schemas:
                        # Shallow copy is safe here: 'jobLocation' is a
                        # fresh dict per copy, and 'geo' is set on it.
                        schema_copy = base_schema.copy()

                        schema_copy['jobLocation'] = {
                            "@type": "Place",
                            "address": {
                                "@type": "PostalAddress",
                                "streetAddress": row.get('Street Address', ''),
                                "addressLocality": row.get('City', ''),
                                "addressRegion": row.get('State', ''),
                                "postalCode": row.get('Zip Code', ''),
                                # Optional CSV column; defaults to US.
                                "addressCountry": row.get('addressCountry', 'US')
                            }
                        }

                        latitude = row.get('Latitude')
                        longitude = row.get('Longitude')
                        # Attach geo only when both coordinates are non-empty.
                        if latitude and longitude:
                            schema_copy['jobLocation']['geo'] = {
                                "@type": "GeoCoordinates",
                                "latitude": latitude,
                                "longitude": longitude
                            }

                        fout.write(json.dumps(schema_copy) + '\n')
                        schemas_written += 1

            return schemas_written

    except (OSError, csv.Error) as e:
        # Narrowed from a bare `except Exception` so programming errors
        # surface instead of being reported as location-processing failures.
        print(f"❌ Error processing locations: {e}")
        return 0

# --- Main Execution ---
if __name__ == "__main__":
    # Command-line interface: every path argument has a sensible default.
    cli = argparse.ArgumentParser(description="Generate Geo-Specific Job Schemas.")
    cli.add_argument("-s", "--schemas", type=str, default="schemas.ndjson",
                     help="Path to the input NDJSON schema file (default: schemas.ndjson)")
    cli.add_argument("-l", "--locations", type=str, default="locations-geo.csv",
                     help="Path to the input CSV locations file (default: locations-geo.csv)")
    cli.add_argument("-o", "--output", type=str, default="all-schemas.ndjson",
                     help="Path to the output NDJSON file (default: all-schemas.ndjson)")
    args = cli.parse_args()

    print("🚀 Starting optimized schema multiplication process...")
    print(f"   Input Schemas:   {args.schemas}")
    print(f"   Input Locations: {args.locations}")
    print(f"   Output File:     {args.output}\n")

    # Step 1: load every JobPosting schema into memory.
    base_schemas = load_and_prepare_schemas(args.schemas)

    # Step 2: multiply them across the location rows, streaming to disk.
    if base_schemas:
        schemas_written = process_locations_stream(args.locations, base_schemas, args.output)
        print(f"\n✅ Done! {schemas_written} schemas saved to '{args.output}'")
    else:
        print("\n⏹️ Process finished. No schemas were written.")