# Source code for ai_essay_evaluator.evaluator.utils
import unicodedata
from pathlib import Path
import ftfy
import pandas as pd
[docs]
def normalize_text(text):
    """
    Normalize text to handle encoding issues with special characters.

    Non-string input is returned unchanged. For strings, a known mis-decoded
    apostrophe sequence and the curly single quotation marks are replaced with
    a plain ASCII apostrophe, then the text is NFKD-decomposed and every
    combining mark is dropped (so ``"é"`` becomes ``"e"``).
    """
    if not isinstance(text, str):
        return text

    # Mis-decoded "smart apostrophe" byte sequence -> plain apostrophe.
    text = text.replace("\u201a\u00c4\u00f4", "'")
    # Right (U+2019) and left (U+2018) single quotation marks -> plain apostrophe.
    text = text.translate({0x2019: "'", 0x2018: "'"})

    # Compatibility-decompose, then keep only characters that are not
    # combining marks (combining class 0).
    decomposed = unicodedata.normalize("NFKD", text)
    kept = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(kept)
[docs]
def normalize_response_text(df: pd.DataFrame) -> pd.DataFrame:
"""
Normalize text columns in a DataFrame.
"""
df = df.map(lambda x: ftfy.fix_text(x) if isinstance(x, str) else x)
# Replace NaN in Student Constructed Response with None.
# Use a fresh object-dtype Series so pandas' str dtype (3.0+) doesn't coerce None back to NaN.
if "Student Constructed Response" in df.columns:
col = df["Student Constructed Response"]
df["Student Constructed Response"] = pd.Series(
[x if pd.notna(x) else None for x in col],
index=df.index,
dtype=object,
)
return df
[docs]
def validate_csv(df: pd.DataFrame) -> None:
    """
    Check that the DataFrame has every required column.

    Raises:
        ValueError: If any of "Local Student ID", "Enrolled Grade Level" or
            "Tested Language" is absent; the message lists the missing names.
    """
    required_columns = {"Local Student ID", "Enrolled Grade Level", "Tested Language"}
    missing = required_columns - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")
[docs]
def read_text_files(folder: Path) -> dict[str, str]:
    """
    Read every ``*.txt`` file directly inside ``folder``.

    Each file is decoded as UTF-8, stripped of leading and trailing
    whitespace, has non-breaking spaces replaced by regular spaces, and is
    then passed through ``normalize_text``.

    Returns:
        A mapping from file name (not full path) to the normalized contents.
    """
    contents: dict[str, str] = {}
    for txt_path in folder.glob("*.txt"):
        raw = txt_path.read_text(encoding="utf-8")
        trimmed = raw.strip().replace("\u00a0", " ")
        contents[txt_path.name] = normalize_text(trimmed)
    return contents