Skip to content

NLP Collection

NLP multi-label binarization functions. Converts text columns containing delimited labels into long-format binary occurrence tables (one row per original row and label, with a 0/1 value column). Supports automatic language detection, lemmatization, accent removal, and fuzzy matching to user-provided hint labels.

nlp

NLP collection -- text label binarization functions.

Provides unsupervised and supervised multi-label binarization for text columns. Converts comma-, semicolon-, or pipe-separated label strings into long-format occurrence tables with a value column, using automatic language detection, lemmatization, and fuzzy matching.

nlp_binarize_labels_auto

nlp_binarize_labels_auto(df, column, max_labels=30)

Unsupervised multi-label binarizer.

Pipeline: language detection → delimiter detection → lemmatization → accent removal → binary matrix generation → top-N filtering.

PARAMETER DESCRIPTION
df

Input DataFrame.

TYPE: DataFrame

column

Column containing delimited label strings (e.g. "pain, headache, nausea").

TYPE: str

max_labels

Maximum number of label columns to keep (most frequent).

TYPE: int DEFAULT: 30

RETURNS DESCRIPTION
DataFrame

Long-format DataFrame with the original columns where column now contains one detected label per row, plus a value column (0 or 1).

Source code in src/choregraph/collection/nlp.py
def nlp_binarize_labels_auto(df: pd.DataFrame, column: str, max_labels: int = 30) -> pd.DataFrame:
    """Unsupervised multi-label binarizer.

    Pipeline: language detection → delimiter detection → lemmatization →
    accent removal → binary matrix generation → top-N filtering →
    reshaping to long format.

    Args:
        df: Input DataFrame.
        column: Column containing delimited label strings
            (e.g. ``"pain, headache, nausea"``). The delimiter is the most
            frequent of ``,``, ``;`` and ``|`` in the first 1000 non-null
            values; if none occurs, each cell is treated as a single label.
            Non-string cells are converted with ``str``.
        max_labels: Maximum number of label columns to keep (most frequent).

    Returns:
        Long-format DataFrame with the original columns where *column* now
        contains one detected label per row, plus a ``value`` column (0 or 1).
        If the exploded label series is empty (e.g. *df* has no rows), an
        empty DataFrame carrying only ``df.index`` is returned instead.

    Raises:
        ValueError: If *column* is not a column of *df*.
    """
    from choregraph._extras import optional_dep
    with optional_dep():
        from unidecode import unidecode
        import simplemma
        from langdetect import detect, DetectorFactory
    # Pin langdetect's seed so repeated calls give the same language.
    DetectorFactory.seed = 0

    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame")
    series = df[column]
    # Non-null cells as strings; shared by every detection/extraction step.
    texts = series.dropna().astype(str)

    # --- 1. LANGUAGE DETECTION ---
    # Detect on a small sample; default to English when there is no text.
    sample_text = " ".join(texts.head(50))
    detected_lang = 'en'
    if sample_text.strip():
        try:
            detected_lang = detect(sample_text)
        except Exception:
            # Deliberate best-effort: any detection failure falls back to
            # English rather than aborting the whole binarization.
            detected_lang = 'en'

    # --- 2. DELIMITER DETECTION ---
    # The most frequent candidate delimiter wins; None means "no delimiter".
    delimiters = re.findall(r'[,;|]', "".join(texts.head(1000)))
    sep = Counter(delimiters).most_common(1)[0][0] if delimiters else None

    # --- 3. EXTRACTION AND LEMMATIZATION ---
    if sep:
        # The separator is backslash-escaped because '|' is a regex operator.
        raw_tokens = texts.str.split(rf'\s*\{sep}\s*').explode()
    else:
        raw_tokens = texts
    unique_raw_tokens = sorted(set(raw_tokens.str.strip()))

    # Maps each distinct raw token to its normalised label.
    mapping = {}
    for original in unique_raw_tokens:
        if not original:
            continue
        word_for_lemma = original.lower().strip()
        # Lemmatize (e.g., shoulders -> shoulder)
        try:
            lemma = simplemma.lemmatize(word_for_lemma, lang=detected_lang)
        except Exception:
            # If the language is not supported by simplemma, use the word as is.
            lemma = word_for_lemma
        # Remove accents; commas are replaced with dots in the final label.
        mapping[original] = unidecode(lemma).replace(',', '.')

    # --- 4. RECONSTRUCTION ---
    def apply_mapping(text):
        """Return the de-duplicated normalised labels of one cell."""
        if pd.isna(text):
            return []
        # Coerce once: ``mapping`` is keyed by string tokens, and without a
        # delimiter the raw cell itself is the token, so a non-string cell
        # (e.g. an int) would otherwise fail on ``.strip()``.
        text = str(text)
        if text.lower() == 'nan' or not text.strip():
            return []
        pieces = text.split(sep) if sep else [text]
        tokens = [t.strip() for t in pieces]
        return list({mapping.get(t, t) for t in tokens if t})

    clean_lists = series.apply(apply_mapping)

    # --- 5. BINARY MATRIX GENERATION ---
    exploded = clean_lists.explode()
    if exploded.empty:
        return pd.DataFrame(index=df.index)
    # NOTE(review): rows are regrouped by index label, so rows sharing an
    # index label are merged here — confirm callers pass a unique index.
    matrix = pd.get_dummies(exploded).groupby(level=0).sum()
    # Ensure it aligns with original index (in case of empty or dropped rows)
    matrix = matrix.reindex(df.index).fillna(0).astype(int)

    # --- 6. LIMIT COLUMNS TO TOP N ---
    if len(matrix.columns) > max_labels:
        top_cols = matrix.sum().nlargest(max_labels).index
        matrix = matrix[top_cols]

    # --- 7. LONGIFY ---
    # Replace the text column by the label columns, then melt them so that
    # *column* holds the label name and "value" holds the 0/1 indicator.
    df_base = df.drop(columns=[column])
    df_wide = pd.concat([df_base, matrix], axis=1)
    label_cols = list(matrix.columns)
    id_vars = [c for c in df_wide.columns if c not in label_cols]
    df_long = df_wide.melt(id_vars=id_vars, value_vars=label_cols,
                           var_name=column, value_name="value")
    df_long["value"] = df_long["value"].fillna(0).astype(int)
    return df_long

nlp_binarize_labels_hinted

nlp_binarize_labels_hinted(df, column, hints, max_labels=30)

Supervised multi-label binarizer using hint labels.

Uses fuzzy string matching (Levenshtein distance, score cutoff 80) to map extracted tokens to the provided hint list, correcting typos and variations.

PARAMETER DESCRIPTION
df

Input DataFrame.

TYPE: DataFrame

column

Column containing delimited label strings.

TYPE: str

hints

List of canonical label names to match against.

TYPE: List[str]

max_labels

Maximum number of label columns to keep.

TYPE: int DEFAULT: 30

RETURNS DESCRIPTION
DataFrame

Long-format DataFrame with the original columns where column now contains one matched hint label per row, plus a value column (0 or 1).

Source code in src/choregraph/collection/nlp.py
def nlp_binarize_labels_hinted(df: pd.DataFrame, column: str, hints: List[str], max_labels: int = 30) -> pd.DataFrame:
    """Supervised multi-label binarizer using hint labels.

    Uses fuzzy string matching (rapidfuzz ``process.extractOne`` with its
    default scorer and a score cutoff of 80) to map extracted tokens to the
    provided hint list, correcting typos and variations. Tokens and hints
    are compared after accent removal, lowercasing and stripping. Tokens
    without a match are kept in that normalised form.

    Args:
        df: Input DataFrame.
        column: Column containing delimited label strings. The delimiter is
            the most frequent of ``,``, ``;`` and ``|`` in the first 1000
            non-null values; if none occurs, each cell is treated as a
            single label. Non-string cells are converted with ``str``.
        hints: List of canonical label names to match against.
        max_labels: Maximum number of label columns to keep.

    Returns:
        Long-format DataFrame with the original columns where *column* now
        contains one matched hint label per row, plus a ``value`` column (0 or 1).
        If *hints* is empty, or the exploded label series is empty (e.g.
        *df* has no rows), an empty DataFrame carrying only ``df.index`` is
        returned instead.

    Raises:
        ValueError: If *column* is not a column of *df*.
    """
    from choregraph._extras import optional_dep
    with optional_dep():
        from unidecode import unidecode
        from rapidfuzz import process

    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame")
    if not hints:
        return pd.DataFrame(index=df.index)
    series = df[column]
    # Non-null cells as strings; shared by detection and extraction.
    texts = series.dropna().astype(str)

    # 1. DELIMITER DETECTION
    # Detection runs on an accent-stripped, lowercased sample of the text.
    clean_sample = texts.head(1000).apply(lambda x: unidecode(x).lower().strip())
    all_raw_text = "".join(clean_sample.values)
    delimiters = re.findall(r'[,;|]', all_raw_text)
    sep = Counter(delimiters).most_common(1)[0][0] if delimiters else None

    # 2. TOKEN EXTRACTION
    if sep:
        # The separator is backslash-escaped because '|' is a regex operator.
        raw_tokens = texts.str.split(rf'\s*\{sep}\s*').explode()
    else:
        raw_tokens = texts
    unique_tokens = sorted(set(raw_tokens.str.strip()))

    # 3. FUZZY MATCHING TO HINTS
    # Maps each distinct raw token to a hint, or to its cleaned form.
    mapping = {}
    clean_hints = [unidecode(h).lower().strip() for h in hints]
    for token in unique_tokens:
        if not token:
            continue
        token_clean = unidecode(token).lower().strip()
        # Best fuzzy match among the normalised hints (score_cutoff=80).
        match = process.extractOne(token_clean, clean_hints, score_cutoff=80)
        if match:
            # match[2] is the index of the detected hint; return the hint
            # exactly as the caller spelled it.
            mapping[token] = hints[match[2]]
        else:
            # If no match, keep the token (unidecoded and cleaned)
            mapping[token] = token_clean

    # 4. RECONSTRUCTION
    def apply_mapping(text):
        """Return the de-duplicated mapped labels of one cell."""
        if pd.isna(text):
            return []
        # Coerce once: ``mapping`` is keyed by string tokens, and without a
        # delimiter the raw cell itself is the token, so a non-string cell
        # (e.g. an int) would otherwise fail on ``.strip()``.
        text = str(text)
        if text.lower() == 'nan' or not text.strip():
            return []
        pieces = text.split(sep) if sep else [text]
        labels = set()
        for piece in pieces:
            t = piece.strip()
            if not t:
                continue
            if t in mapping:
                labels.add(mapping[t])
            else:
                # Only normalise on a miss instead of on every lookup.
                labels.add(unidecode(t).lower().strip())
        return list(labels)

    clean_lists = series.apply(apply_mapping)

    # 5. BINARY MATRIX GENERATION
    exploded = clean_lists.explode()
    if exploded.empty:
        return pd.DataFrame(index=df.index)
    # NOTE(review): rows are regrouped by index label, so rows sharing an
    # index label are merged here — confirm callers pass a unique index.
    matrix = pd.get_dummies(exploded).groupby(level=0).sum()
    # Ensure it aligns with original index
    matrix = matrix.reindex(df.index).fillna(0).astype(int)

    # 6. LIMIT COLUMNS TO TOP N
    if len(matrix.columns) > max_labels:
        top_cols = matrix.sum().nlargest(max_labels).index
        matrix = matrix[top_cols]

    # --- 7. LONGIFY ---
    # Replace the text column by the label columns, then melt them so that
    # *column* holds the label name and "value" holds the 0/1 indicator.
    df_base = df.drop(columns=[column])
    df_wide = pd.concat([df_base, matrix], axis=1)
    label_cols = list(matrix.columns)
    id_vars = [c for c in df_wide.columns if c not in label_cols]
    df_long = df_wide.melt(id_vars=id_vars, value_vars=label_cols,
                           var_name=column, value_name="value")
    df_long["value"] = df_long["value"].fillna(0).astype(int)
    return df_long