Source code for ai_essay_evaluator.evaluator.file_handler

import pandas as pd

from .utils import normalize_text



[docs]
def save_results(df, output_path, calculate_totals=True):
    if calculate_totals:
        # Determine which columns to use for score calculation based on column names
        if "idea_development_score" in df.columns and "language_conventions_score" in df.columns:
            # Extended format
            score_columns = ["idea_development_score", "language_conventions_score"]
            # Convert columns to numeric and fill NaN with 0
            for col in score_columns:
                df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
            # Calculate total score
            df["total_score"] = df[score_columns].sum(axis=1)
        elif "score" in df.columns:
            # Item-specific or short format
            df["score"] = pd.to_numeric(df["score"], errors="coerce").fillna(0)
            df["total_score"] = df["score"]

    # Fix encoding for text columns before saving
    text_columns = ["feedback", "idea_development_feedback", "language_conventions_feedback"]
    for col in text_columns:
        if col in df.columns:
            df[col] = df[col].apply(normalize_text)

    # For all other string columns, also normalize text
    for col in df.columns:
        if df[col].dtype == "object" and col not in text_columns:
            df[col] = df[col].apply(normalize_text)

    # Explicitly set encoding to utf-8 when writing to CSV
    df.to_csv(output_path, index=False, encoding="utf-8")




[docs]
def merge_csv_files(file_paths, output_path, scoring_format, calculate_totals=True):
    """
    Merge multiple CSV files from different passes while preserving pass information.
    Uses the total_score from each pass and calculates a new merged total.

    Args:
        file_paths: List of Paths to CSV files to merge
        output_path: Path to save the merged output
        scoring_format: The scoring format (extended, item-specific, short)
        calculate_totals: Whether to calculate total scores
    """
    # Determine which columns to extract based on scoring format
    if scoring_format == "extended":
        score_columns = [
            "idea_development_score",
            "idea_development_feedback",
            "language_conventions_score",
            "language_conventions_feedback",
        ]
    else:  # For item-specific or short
        score_columns = ["score", "feedback"]

    # Function to read CSV with proper handling of encodings
    def read_csv_with_encoding(file_path_):
        try:
            # Try UTF-8 first
            df = pd.read_csv(file_path_, encoding="utf-8")
        except UnicodeDecodeError:
            # If that fails, try with ISO-8859-1 (Latin-1)
            df = pd.read_csv(file_path_, encoding="iso-8859-1")

        # Apply text normalization to all string columns
        for col in df.columns:
            if df[col].dtype == "object":
                df[col] = df[col].apply(normalize_text)

        return df

    # Base dataframe - use the first file to get all common columns
    base_df = read_csv_with_encoding(file_paths[0])

    # Track total score columns for calculation (but won't be kept in final output)
    pass_total_scores = []

    # Extract pass number from filename
    for file_path in file_paths:
        file_name = file_path.name
        # Extract pass number (assumes format with "_pass_N.csv")
        if "_pass_" in file_name:
            pass_num = file_name.split("_pass_")[1].split(".")[0]

            # Read the file with proper encoding
            pass_df = read_csv_with_encoding(file_path)

            # Process as before...
            if file_path != file_paths[0]:
                merge_cols = ["testentryid", "TeacherName"]
                temp_cols = merge_cols + [col for col in score_columns if col in pass_df.columns]
                temp_df = pass_df[temp_cols].copy()

                rename_dict = {}
                for col in score_columns:
                    if col in pass_df.columns:
                        rename_dict[col] = f"{col}_pass{pass_num}"

                temp_df = temp_df.rename(columns=rename_dict)
                base_df = pd.merge(base_df, temp_df, on=merge_cols, how="left")

                if "total_score" in pass_df.columns:
                    pass_total_scores.append(pd.to_numeric(pass_df["total_score"], errors="coerce").fillna(0))
            else:
                rename_dict = {}
                for col in score_columns:
                    if col in pass_df.columns:
                        rename_dict[col] = f"{col}_pass{pass_num}"

                base_df = base_df.rename(columns=rename_dict)

                if "total_score" in pass_df.columns:
                    pass_total_scores.append(pd.to_numeric(pass_df["total_score"], errors="coerce").fillna(0))

    # Calculate merged total score based on individual pass total scores
    if calculate_totals and pass_total_scores:
        total_scores_df = pd.DataFrame(index=base_df.index)

        for i, scores in enumerate(pass_total_scores):
            if len(scores) == len(base_df):
                total_scores_df[f"pass_{i + 1}"] = scores.values

        base_df["total_score"] = total_scores_df.sum(axis=1)

    # Normalize all text columns before saving
    for col in base_df.columns:
        if base_df[col].dtype == "object":
            base_df[col] = base_df[col].apply(normalize_text)

    # Reorder columns to place total_score after the "Tested Language" column
    if "total_score" in base_df.columns:
        # Get a list of all columns
        cols = list(base_df.columns)

        # Remove total_score from current position
        cols.remove("total_score")

        # Try to find "Tested Language" column
        if "Tested Language" in cols:
            # Insert after Tested Language column
            tested_lang_pos = cols.index("Tested Language")
            cols.insert(tested_lang_pos + 1, "total_score")
        else:
            # Fall back to the previous approach - after ID columns
            id_cols = []
            for col in cols:
                if (
                    col.lower() in ["testentryid", "id", "student_id", "local student id"]
                    or "name" in col.lower()
                    or col == "TeacherName"
                ):
                    id_cols.append(col)

            # Find the position after the last ID column
            insertion_point = 0
            for col in id_cols:
                pos = cols.index(col) + 1
                insertion_point = max(insertion_point, pos)

            # Insert total_score at the determined position
            cols.insert(insertion_point, "total_score")

        # Reorder the DataFrame
        base_df = base_df[cols]

    # Save the merged dataframe with explicit utf-8 encoding
    base_df.to_csv(output_path, index=False, encoding="utf-8")

    return base_df