Source code for ai_essay_evaluator.evaluator.utils

import unicodedata
from pathlib import Path

import ftfy
import pandas as pd



[docs]
def normalize_text(text):
    """
    Clean up apostrophe encoding artifacts and strip accents from a string.

    Non-string input is handed back untouched. For strings, a known garbled
    smart-apostrophe sequence and both curly single quotation marks are
    replaced with a plain ASCII apostrophe. The text is then NFKD-decomposed
    and every combining mark is dropped (so an accented letter becomes its
    base letter). Characters that have no decomposition are left as they are.
    """
    if not isinstance(text, str):
        return text

    # Every variant listed here collapses to the plain ASCII apostrophe.
    apostrophe_variants = (
        "\u201a\u00c4\u00f4",  # Garbled smart-apostrophe sequence
        "\u2019",  # Right single quotation mark
        "\u2018",  # Left single quotation mark
    )
    for variant in apostrophe_variants:
        text = text.replace(variant, "'")

    # Decompose first so accents become separate combining marks, then keep
    # only the characters that are not combining marks.
    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(ch for ch in decomposed if unicodedata.combining(ch) == 0)




[docs]
def normalize_response_text(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize text columns in a DataFrame.
    """
    df = df.map(lambda x: ftfy.fix_text(x) if isinstance(x, str) else x)

    # Replace NaN in Student Constructed Response with None.
    # Use a fresh object-dtype Series so pandas' str dtype (3.0+) doesn't coerce None back to NaN.
    if "Student Constructed Response" in df.columns:
        col = df["Student Constructed Response"]
        df["Student Constructed Response"] = pd.Series(
            [x if pd.notna(x) else None for x in col],
            index=df.index,
            dtype=object,
        )
    return df




[docs]
def validate_csv(df: pd.DataFrame) -> None:
    """
    Check that the DataFrame contains every required column.

    Raises:
        ValueError: If any of "Local Student ID", "Enrolled Grade Level" or
            "Tested Language" is absent; the message lists the missing names.
    """
    required_columns = {"Local Student ID", "Enrolled Grade Level", "Tested Language"}

    # Nothing to report when all required names are among the columns.
    if required_columns <= set(df.columns):
        return

    raise ValueError(f"Missing required columns: {required_columns - set(df.columns)}")




[docs]
def read_text_files(folder: Path) -> dict[str, str]:
    """
    Load every ``*.txt`` file directly inside ``folder``.

    Each file is read as UTF-8, stripped of surrounding whitespace, has its
    non-breaking spaces turned into regular spaces, and is then passed
    through ``normalize_text``.

    Returns a dict mapping each file's name (not its full path) to the
    cleaned content.
    """
    contents: dict[str, str] = {}
    for txt_path in folder.glob("*.txt"):
        raw = txt_path.read_text(encoding="utf-8").strip()
        contents[txt_path.name] = normalize_text(raw.replace("\u00a0", " "))
    return contents