Source code for ai_essay_evaluator.trainer.validator

import json



[docs]
def validate_jsonl(jsonl_path: str, scoring_format: str) -> bool:
    """
    Validate JSONL file format for OpenAI fine-tuning.

    Every line must be a JSON object with a ``messages`` list whose roles are
    exactly ``system``, ``user``, ``assistant`` (in that order), and whose
    assistant content mentions the score field(s) required by
    ``scoring_format``. The first problem found is printed and the process
    is terminated.

    Args:
        jsonl_path: Path to the JSONL file to validate
        scoring_format: Scoring format for the JSONL file. ``"extended"``
            requires ``Idea_Development_Score`` and
            ``Language_Conventions_Score``; any other value requires ``Score``.

    Returns:
        bool: True if file is valid, exits with code 1 otherwise

    Raises:
        SystemExit: With code 1 when the file cannot be read or any line
            fails validation.

    """
    try:
        with open(jsonl_path, encoding="utf-8") as f:
            lines = f.readlines()
    except Exception as e:
        print(f"❌ Error reading JSONL file: {e}")
        # Raise SystemExit directly: the ``exit`` builtin is injected by the
        # ``site`` module and is not guaranteed to exist in every runtime.
        raise SystemExit(1) from e

    for line_no, line in enumerate(lines, start=1):
        try:
            entry = json.loads(line.strip())
            _check_entry(entry, scoring_format, line_no)
        # JSONDecodeError subclasses ValueError, so it must be handled first.
        except json.JSONDecodeError:
            print(f"❌ Error: Invalid JSON format on line {line_no}")
            raise SystemExit(1) from None
        except ValueError as ve:
            print(f"❌ Error: {ve} (line {line_no})")
            raise SystemExit(1) from None

    print(f"✅ JSONL file '{jsonl_path}' is valid for fine-tuning!")
    return True


def _check_entry(entry, scoring_format: str, line_no: int) -> None:
    """Raise ValueError if one parsed JSONL entry is not a valid training example."""
    # A line can be valid JSON without being an object (e.g. a list or number).
    if not isinstance(entry, dict):
        raise ValueError("Entry should be a JSON object")

    # Ensure required fields exist
    if "messages" not in entry:
        raise ValueError("Missing 'messages' key")

    messages = entry["messages"]
    if not isinstance(messages, list):
        raise ValueError("'messages' should be a list")

    if not all(isinstance(msg, dict) for msg in messages):
        raise ValueError("Each item in 'messages' should be a JSON object")

    # Check role structure
    expected_roles = ["system", "user", "assistant"]
    roles = [msg.get("role") for msg in messages]
    if roles != expected_roles:
        raise ValueError(f"Incorrect roles sequence in entry {line_no}: {roles}")

    if scoring_format == "extended":
        required_fields = ["Idea_Development_Score", "Language_Conventions_Score"]
    else:  # short or item-specific
        required_fields = ["Score"]

    # Check assistant's response format
    assistant_msg = messages[-1].get("content")
    if assistant_msg is None:
        raise ValueError("Missing 'content' in assistant response")

    try:
        # For string content this is a substring test on the raw text.
        has_all_fields = all(key in assistant_msg for key in required_fields)
    except TypeError:
        # Content of a type that does not support ``in`` (e.g. a number).
        raise ValueError("Assistant 'content' has an unsupported type") from None

    if not has_all_fields:
        raise ValueError(f"Missing expected score fields for {scoring_format} format in assistant response")