
🧪 Example: Processing CoNLL NER Data with Chisel

This example demonstrates how to parse the CoNLL-2003 dataset into Chisel's internal ChiselRecord format, suitable for training transformer-based token classification models.

📥 Step 1: Download CoNLL Data

We use the version hosted by the CrossWeigh repository.

import requests

url = "https://raw.githubusercontent.com/ZihanWangKi/CrossWeigh/refs/heads/master/data/conllpp_train.txt"
response = requests.get(url)
# Split the raw file into one string per document and drop empty chunks
docs = response.text.split("-DOCSTART- -X- -X- O\n\n")
docs = list(filter(lambda x: len(x) > 0, docs))
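
Each document is a block of CoNLL lines: one token per line with four whitespace-separated columns (token, POS tag, chunk tag, NER tag) and blank lines between sentences. A quick peek at the first document confirms the layout before we write the parser:

# Show the first few lines of the first document; the exact tokens depend
# on the downloaded file, but the four-column layout should be visible.
print("\n".join(docs[0].splitlines()[:5]))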

🧩 Step 2: Define a Parser for CoNLL Format

Note: While this may seem a bit complicated at first glance, with the help of generative AI and the validators Chisel provides, it should be fairly quick to write your own custom parsers.

from typing import Tuple, List
from chisel.extraction.base.protocols import Parser
from chisel.extraction.models.models import EntitySpan
import string

class ConllParser(Parser):
    def parse(self, doc: str) -> Tuple[str, List[EntitySpan]]:
        """Convert one CoNLL document into (reconstructed text, entity spans)."""
        tokens, labels = [], []
        for line in doc.strip().splitlines():
            if not line.strip():
                continue
            splits = line.strip().split(" ")
            tokens.append(splits[0])
            labels.append(splits[-1])

        text = ""
        spans = []
        char_offset = 0
        i = 0

        while i < len(tokens):
            token = tokens[i]
            label = labels[i]

            if text and token not in string.punctuation:
                text += " "
                char_offset += 1

            if label.startswith("B-"):
                ent_label = label[2:]
                ent_start = char_offset
                ent_text = token
                text += token
                char_offset += len(token)
                i += 1
                while i < len(tokens) and labels[i].startswith("I-"):
                    text += " " + tokens[i]
                    ent_text += " " + tokens[i]
                    char_offset += 1 + len(tokens[i])
                    i += 1
                ent_end = char_offset
                spans.append(EntitySpan(text=ent_text, start=ent_start, end=ent_end, label=ent_label))
            else:
                text += token
                char_offset += len(token)
                i += 1

        return text.strip(), spans
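
As a quick sanity check, here is what the parser produces for a small hand-written document (illustrative input, not taken from the downloaded file; it assumes EntitySpan exposes its constructor fields as attributes):

sample = (
    "EU NNP B-NP B-ORG\n"
    "rejects VBZ B-VP O\n"
    "German JJ B-NP B-MISC\n"
    "call NN B-NP O\n"
    ". . O O"
)
text, spans = ConllParser().parse(sample)
print(text)  # EU rejects German call.
for span in spans:
    print(span.label, span.start, span.end, span.text)
# ORG 0 2 EU
# MISC 11 17 German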

🔧 Step 3: Initialize Chisel Components

from transformers import AutoTokenizer
from chisel.extraction.tokenizers.hf_tokenizer import HFTokenizer
from chisel.extraction.span_aligners.token_span_aligner import TokenSpanAligner
from chisel.extraction.labelers.bio_labeler import BIOLabeler
from chisel.extraction.labelers.label_encoder import SimpleLabelEncoder
from chisel.extraction.validators.validators import DefaultParseValidator, HFTokenAlignmentValidator
from chisel.extraction.formatters.hf_formatter import HFDatasetFormatter
from chisel.extraction.models.models import ChiselRecord

📦 Components

parser = ConllParser()
tokenizer = HFTokenizer(model_name="bert-base-cased")
aligner = TokenSpanAligner()
labeler = BIOLabeler()
label_encoder = SimpleLabelEncoder(label_to_id={
    'O': 0,
    'B-ORG': 1,
    'I-ORG': 2,
    'B-PER': 3,
    'I-PER': 4,
    'B-MISC': 5,
    'I-MISC': 6,
    'B-LOC': 7,
    'I-LOC': 8
})

parse_validators = [DefaultParseValidator(on_error="raise")]
label_validators = [HFTokenAlignmentValidator(tokenizer=tokenizer.tokenizer, on_error="raise")]
formatter = HFDatasetFormatter()
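
Before running the full loop, it can be useful to smoke-test the components on a single document; the snippet below only reuses calls that appear in the pipeline in Step 4:

# Parse, tokenize, and align one document end to end as a sanity check.
sample_text, sample_spans = parser.parse(docs[0])
sample_tokens = tokenizer.tokenize(sample_text)
sample_aligned = aligner.align(sample_spans, sample_tokens)
print(len(sample_tokens), "tokens,", len(sample_aligned), "aligned entity spans")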

🔄 Step 4: Run the Pipeline

processed_data = []

# ๐Ÿ” Pipeline loop
for idx, example in enumerate(docs):
    text, entities = parser.parse(example)

    # 🧪 Per-span validation: skip bad spans
    valid_spans = []
    for span in entities:
        try:
            for validator in parse_validators:
                validator.validate(text, span)
            valid_spans.append(span)
        except ValueError:
            continue  # drop spans that fail parse validation

    tokens = tokenizer.tokenize(text)
    token_entity_spans = aligner.align(valid_spans, tokens)

    labels = labeler.label(tokens, token_entity_spans)
    encoded_labels = label_encoder.encode(labels)

    # 🧪 Per-span validation: skip bad spans
    valid_token_spans = []
    for span in token_entity_spans:
        try:
            for validator in label_validators:
                validator.validate(tokens, span)
            valid_token_spans.append(span)
        except ValueError:
            continue  # Optionally log or collect stats on dropped spans

    record = ChiselRecord(
                id=str(idx),
                chunk_id=0,
                text=tokenizer.tokenizer.decode([token.id for token in tokens]),
                tokens=tokens,
                input_ids=[token.id for token in tokens],
                attention_mask=[1] * len(tokens),
                entities=[tes.entity for tes in valid_token_spans],
                bio_labels=labels,
                labels=encoded_labels
            )
    processed_data.append(record)

data = formatter.format(processed_data)

✅ Output

You now have a Hugging Face dataset ready for training!
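
Assuming HFDatasetFormatter returns a standard datasets.Dataset, you can inspect and persist it like any other Hugging Face dataset (the output path below is just an example):

# Peek at the first record and save the dataset to disk for later use.
print(data[0]["labels"][:10])
data.save_to_disk("conllpp_train_chisel")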