import pandas as pd
from .utils import normalize_text
[docs]
def save_results(df, output_path, calculate_totals=True):
if calculate_totals:
# Determine which columns to use for score calculation based on column names
if "idea_development_score" in df.columns and "language_conventions_score" in df.columns:
# Extended format
score_columns = ["idea_development_score", "language_conventions_score"]
# Convert columns to numeric and fill NaN with 0
for col in score_columns:
df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
# Calculate total score
df["total_score"] = df[score_columns].sum(axis=1)
elif "score" in df.columns:
# Item-specific or short format
df["score"] = pd.to_numeric(df["score"], errors="coerce").fillna(0)
df["total_score"] = df["score"]
# Fix encoding for text columns before saving
text_columns = ["feedback", "idea_development_feedback", "language_conventions_feedback"]
for col in text_columns:
if col in df.columns:
df[col] = df[col].apply(normalize_text)
# For all other string columns, also normalize text
for col in df.columns:
if df[col].dtype == "object" and col not in text_columns:
df[col] = df[col].apply(normalize_text)
# Explicitly set encoding to utf-8 when writing to CSV
df.to_csv(output_path, index=False, encoding="utf-8")
[docs]
def merge_csv_files(file_paths, output_path, scoring_format, calculate_totals=True):
"""
Merge multiple CSV files from different passes while preserving pass information.
Uses the total_score from each pass and calculates a new merged total.
Args:
file_paths: List of Paths to CSV files to merge
output_path: Path to save the merged output
scoring_format: The scoring format (extended, item-specific, short)
calculate_totals: Whether to calculate total scores
"""
# Determine which columns to extract based on scoring format
if scoring_format == "extended":
score_columns = [
"idea_development_score",
"idea_development_feedback",
"language_conventions_score",
"language_conventions_feedback",
]
else: # For item-specific or short
score_columns = ["score", "feedback"]
# Function to read CSV with proper handling of encodings
def read_csv_with_encoding(file_path_):
try:
# Try UTF-8 first
df = pd.read_csv(file_path_, encoding="utf-8")
except UnicodeDecodeError:
# If that fails, try with ISO-8859-1 (Latin-1)
df = pd.read_csv(file_path_, encoding="iso-8859-1")
# Apply text normalization to all string columns
for col in df.columns:
if df[col].dtype == "object":
df[col] = df[col].apply(normalize_text)
return df
# Base dataframe - use the first file to get all common columns
base_df = read_csv_with_encoding(file_paths[0])
# Track total score columns for calculation (but won't be kept in final output)
pass_total_scores = []
# Extract pass number from filename
for file_path in file_paths:
file_name = file_path.name
# Extract pass number (assumes format with "_pass_N.csv")
if "_pass_" in file_name:
pass_num = file_name.split("_pass_")[1].split(".")[0]
# Read the file with proper encoding
pass_df = read_csv_with_encoding(file_path)
# Process as before...
if file_path != file_paths[0]:
merge_cols = ["testentryid", "TeacherName"]
temp_cols = merge_cols + [col for col in score_columns if col in pass_df.columns]
temp_df = pass_df[temp_cols].copy()
rename_dict = {}
for col in score_columns:
if col in pass_df.columns:
rename_dict[col] = f"{col}_pass{pass_num}"
temp_df = temp_df.rename(columns=rename_dict)
base_df = pd.merge(base_df, temp_df, on=merge_cols, how="left")
if "total_score" in pass_df.columns:
pass_total_scores.append(pd.to_numeric(pass_df["total_score"], errors="coerce").fillna(0))
else:
rename_dict = {}
for col in score_columns:
if col in pass_df.columns:
rename_dict[col] = f"{col}_pass{pass_num}"
base_df = base_df.rename(columns=rename_dict)
if "total_score" in pass_df.columns:
pass_total_scores.append(pd.to_numeric(pass_df["total_score"], errors="coerce").fillna(0))
# Calculate merged total score based on individual pass total scores
if calculate_totals and pass_total_scores:
total_scores_df = pd.DataFrame(index=base_df.index)
for i, scores in enumerate(pass_total_scores):
if len(scores) == len(base_df):
total_scores_df[f"pass_{i + 1}"] = scores.values
base_df["total_score"] = total_scores_df.sum(axis=1)
# Normalize all text columns before saving
for col in base_df.columns:
if base_df[col].dtype == "object":
base_df[col] = base_df[col].apply(normalize_text)
# Reorder columns to place total_score after the "Tested Language" column
if "total_score" in base_df.columns:
# Get a list of all columns
cols = list(base_df.columns)
# Remove total_score from current position
cols.remove("total_score")
# Try to find "Tested Language" column
if "Tested Language" in cols:
# Insert after Tested Language column
tested_lang_pos = cols.index("Tested Language")
cols.insert(tested_lang_pos + 1, "total_score")
else:
# Fall back to the previous approach - after ID columns
id_cols = []
for col in cols:
if (
col.lower() in ["testentryid", "id", "student_id", "local student id"]
or "name" in col.lower()
or col == "TeacherName"
):
id_cols.append(col)
# Find the position after the last ID column
insertion_point = 0
for col in id_cols:
pos = cols.index(col) + 1
insertion_point = max(insertion_point, pos)
# Insert total_score at the determined position
cols.insert(insertion_point, "total_score")
# Reorder the DataFrame
base_df = base_df[cols]
# Save the merged dataframe with explicit utf-8 encoding
base_df.to_csv(output_path, index=False, encoding="utf-8")
return base_df