import pandas as pd
# Inbound leads scraped from the conference (~50,000 rows at last export).
new_list = pd.read_csv("conference_list.csv") # 50,000 rows
# Full CRM contact export used as the "already known" reference (~220,000 rows).
existing = pd.read_csv("crm_export.csv") # 220,000 rows
def norm_email(s):
    """Normalize an email value for exact-match comparison.

    Lowercases and strips surrounding whitespace. Missing values (NaN/None)
    return "" instead of the literal string "nan" — previously every row with
    a blank email normalized to "nan" and falsely matched every other blank
    email in the CRM.
    """
    if pd.isna(s):
        return ""
    return str(s).strip().lower()
def norm_company(s):
    """Normalize a company name for exact-match comparison.

    Lowercases, trims whitespace, then walks a fixed list of legal-entity
    suffixes in order, removing each one it finds at the end of the name
    (at most once per suffix). Finally trims again.
    """
    name = str(s).strip().lower()
    legal_suffixes = (" inc", " inc.", " llc", " ltd", " ltd.", " gmbh", " co", " co.")
    for sfx in legal_suffixes:
        if not name.endswith(sfx):
            continue
        name = name[: -len(sfx)]
    return name.strip()
# --- Flag conference leads already present in the CRM, dedupe, export ---

# Normalized join keys on both frames.
for df in (new_list, existing):
    df["email_n"] = df["email"].apply(norm_email)
    df["company_n"] = df["company"].apply(norm_company)

# O(1) membership sets. Drop empty/"nan" keys so rows with missing email or
# company never match each other (str(NaN) normalizes to "nan").
known_emails = set(existing["email_n"]) - {"", "nan"}
known_companies = set(existing["company_n"]) - {"", "nan"}

new_list["match_by_email"] = new_list["email_n"].isin(known_emails)
new_list["match_by_company"] = new_list["company_n"].isin(known_companies)
# Third match: the email domain (after @, minus the TLD, e.g. "acme" from
# "jane@acme.com") matching a known company is treated as a likely dupe.
new_list["domain_n"] = (
    new_list["email_n"].str.split("@").str[-1].str.split(".").str[0]
)
new_list["match_by_domain"] = new_list["domain_n"].isin(known_companies)

new_list["already_known"] = (
    new_list["match_by_email"]
    | new_list["match_by_company"]
    | new_list["match_by_domain"]
)
net_new = new_list[~new_list["already_known"]].drop(
    columns=["match_by_email", "match_by_company", "match_by_domain", "domain_n"]
)

# Dedupe BEFORE counting/exporting — previously the Excel file was written
# first, so internal duplicates leaked into the export and the dedup lines
# below it had no effect on the output.
net_new = net_new.sort_values("score", ascending=False) # keep highest-scoring dupe
net_new = net_new.drop_duplicates(subset="email_n")
net_new = net_new.drop_duplicates(subset=["company_n", "last_name"])

print(f"Net-new leads: {len(net_new):,} of {len(new_list):,}")
net_new.to_excel("net_new_leads.xlsx", index=False)
# NOTE(review): requested third match rule — if the email domain (after @)
# matches a known company, treat the row as a likely dupe.