Source code for ai_essay_evaluator.trainer.validator
import json
[docs]
def validate_jsonl(jsonl_path: str, scoring_format: str) -> bool:
    """
    Validate JSONL file format for OpenAI fine-tuning.

    Every line must be a JSON object with a ``messages`` list whose roles are
    exactly ``system``, ``user``, ``assistant`` (in that order), and whose
    assistant ``content`` mentions the score fields required by
    ``scoring_format``.

    Args:
        jsonl_path: Path to the JSONL file to validate
        scoring_format: Scoring format for the JSONL file. ``"extended"``
            requires ``Idea_Development_Score`` and
            ``Language_Conventions_Score``; any other value requires ``Score``.

    Returns:
        bool: True if file is valid, exits with code 1 otherwise

    Raises:
        SystemExit: With code 1 when the file cannot be read or when any line
            fails validation (an error message is printed first).
    """
    try:
        with open(jsonl_path, encoding="utf-8") as f:
            lines = f.readlines()
    except Exception as e:
        # Deliberately broad: any failure to obtain the file's text (missing
        # file, permissions, bad encoding, bad path type) is reported the same way.
        print(f"❌ Error reading JSONL file: {e}")
        raise SystemExit(1) from e

    # Loop-invariant: the required fields depend only on the scoring format.
    if scoring_format == "extended":
        required_fields = ["Idea_Development_Score", "Language_Conventions_Score"]
    else:  # short or item-specific
        required_fields = ["Score"]

    for line_no, line in enumerate(lines, start=1):
        try:
            entry = json.loads(line.strip())
            _validate_entry(entry, required_fields, scoring_format, line_no)
        except json.JSONDecodeError:
            # Must precede the ValueError clause: JSONDecodeError subclasses it.
            print(f"❌ Error: Invalid JSON format on line {line_no}")
            raise SystemExit(1) from None
        except ValueError as ve:
            print(f"❌ Error: {ve} (line {line_no})")
            raise SystemExit(1) from None

    print(f"✅ JSONL file '{jsonl_path}' is valid for fine-tuning!")
    return True


def _validate_entry(entry: object, required_fields: list, scoring_format: str, line_no: int) -> None:
    """Check one decoded JSONL entry; raise ValueError describing the first problem found."""
    # A JSON line may decode to a list, number, string or null; only an object
    # can carry the "messages" key.
    if not isinstance(entry, dict):
        raise ValueError("Entry should be a JSON object")

    # Ensure required fields exist
    if "messages" not in entry:
        raise ValueError("Missing 'messages' key")
    messages = entry["messages"]
    if not isinstance(messages, list):
        raise ValueError("'messages' should be a list")

    # Check role structure. Non-dict messages contribute a None role so they
    # are reported as a bad role sequence instead of raising AttributeError.
    expected_roles = ["system", "user", "assistant"]
    roles = [msg.get("role") if isinstance(msg, dict) else None for msg in messages]
    if roles != expected_roles:
        raise ValueError(f"Incorrect roles sequence in entry {line_no}: {roles}")

    # Check assistant's response format. The role check above guarantees the
    # last message is a dict, but it may still lack usable content.
    assistant_msg = messages[-1]
    if "content" not in assistant_msg:
        raise ValueError("Missing 'content' in assistant message")
    content = assistant_msg["content"]
    if not isinstance(content, (str, dict, list)):
        raise ValueError("Assistant 'content' should be a string")
    # For string content this is a substring test; for a dict it is a key test.
    if not all(key in content for key in required_fields):
        raise ValueError(f"Missing expected score fields for {scoring_format} format in assistant response")