def profile(df):
return pd.DataFrame({
"dtype": df.dtypes.astype(str),
"missing": df.isna().sum(),
"missing_pct": (df.isna().mean()*100).round(1),
"distinct": df.nunique(),
})
print(profile(emp))
issues = []
# Stray whitespace
for col in ["first_name", "last_name", "email"]:
bad = emp[col].astype(str).str.strip() != emp[col].astype(str)
issues += [(rowid, col, "whitespace") for rowid in emp.index[bad]]
# Mixed case emails
bad = emp["email"].astype(str) != emp["email"].astype(str).str.lower()
issues += [(r, "email", "mixed case") for r in emp.index[bad]]
# Missing manager
issues += [(r, "manager_id", "missing") for r in emp.index[emp["manager_id"].isna()]]
# Manager who doesn't exist
mgr_ids = set(emp["employee_id"])
bad = ~emp["manager_id"].isin(mgr_ids) & emp["manager_id"].notna()
issues += [(r, "manager_id", "non-existent") for r in emp.index[bad]]
# Duplicate IDs
dup = emp[emp.duplicated("employee_id", keep=False)]
issues += [(r, "employee_id", "duplicate") for r in dup.index]
fix = pd.DataFrame(issues, columns=["row_index", "column", "issue"]).merge(
emp[["employee_id", "first_name", "last_name"]],
left_on="row_index", right_index=True
)
fix.to_excel("hris_fix_list.xlsx", index=False)
print(f"⚠️ {len(fix)} issues to fix")
Add a check: flag emails that don't match a basic name@domain.tld pattern. Hint: str.match(r"^[^@\s]+@[^@\s]+\.[^@\s]+$").