Relation extraction (RE) identifies connections between entities in unstructured text—think “Elon Musk founded SpaceX” → (SpaceX, founded_by, Elon Musk). You can build this pipeline three ways: spaCy’s dependency parsing for quick prototypes, fine-tuned transformers for production accuracy, or LLMs with structured outputs for zero-shot flexibility. Here’s how to choose and implement each approach.

The Quick Prototype: spaCy Dependency Parsing

Start with spaCy if you need something working in 10 minutes. It won’t beat a fine-tuned model, but it’s good enough for patterns like “X works at Y” or “Z located in A”.

import spacy
from collections import defaultdict

nlp = spacy.load("en_core_web_trf")  # Use transformer model for better accuracy

def extract_relations_spacy(text):
    doc = nlp(text)
    relations = []

    for ent1 in doc.ents:
        if ent1.label_ not in ["PERSON", "ORG", "GPE"]:
            continue

        # Walk the dependency tree from the entity's syntactic head
        head = ent1.root.head

        # Pattern: "X works at Y" (nsubj -> verb -> prep -> pobj)
        if ent1.root.dep_ == "nsubj" and head.pos_ == "VERB":
            for child in head.children:
                if child.dep_ == "prep" and child.text in ["at", "for"]:
                    for pobj in child.children:
                        if pobj.dep_ == "pobj" and pobj.ent_type_ == "ORG":
                            relations.append({
                                "subject": ent1.text,
                                "relation": "works_at",
                                "object": pobj.text,
                                "confidence": 0.7
                            })

        # Pattern: "X, CEO of Y" (appos)
        for chunk in doc.noun_chunks:
            if chunk.root.dep_ == "appos" and chunk.root.head == ent1.root:
                for prep in chunk.root.children:
                    if prep.dep_ == "prep" and prep.text == "of":
                        for org in prep.children:
                            if org.ent_type_ == "ORG":
                                relations.append({
                                    "subject": ent1.text,
                                    "relation": "role_at",
                                    "object": org.text,
                                    "confidence": 0.6
                                })

    return relations

# Example usage
text = """
Sundar Pichai is the CEO of Google. He works at Alphabet's headquarters in Mountain View.
Demis Hassabis founded DeepMind, which is located in London.
"""

relations = extract_relations_spacy(text)
for rel in relations:
    print(f"{rel['subject']} --[{rel['relation']}]--> {rel['object']} (conf: {rel['confidence']})")

Output (illustrative; with only the two patterns above, expect to miss some of these until you add more rules):

Sundar Pichai --[role_at]--> Google (conf: 0.6)
Sundar Pichai --[works_at]--> Alphabet (conf: 0.7)
DeepMind --[located_in]--> London (conf: 0.7)

The dependency patterns are brittle—you’ll spend hours adding edge cases. Use this for prototypes, not production.

Production Accuracy: Transformer-Based RE Models

For production, use REBEL (Babelscape/rebel-large on the Hugging Face Hub). It’s a seq2seq model that takes raw text and outputs (subject, relation, object) triplets end-to-end, with no entity marking needed.

from transformers import pipeline
import re

# REBEL is a text2text-generation model, not text-classification
re_pipeline = pipeline(
    "text2text-generation",
    model="Babelscape/rebel-large",
    device=0  # Use GPU if available
)

def parse_rebel_output(text):
    """Parse REBEL's special token format into triplets."""
    triplets = []
    # REBEL linearizes triplets as: <triplet> subject <subj> object <obj> relation
    text = text.replace("<s>", "").replace("</s>", "").replace("<pad>", "")
    pattern = r'<triplet>\s*(.*?)\s*<subj>\s*(.*?)\s*<obj>\s*(.*?)(?=<triplet>|$)'
    for match in re.finditer(pattern, text):
        subject, obj, relation = match.groups()
        triplets.append({
            "subject": subject.strip(),
            "relation": relation.strip(),
            "object": obj.strip()
        })
    return triplets

def extract_relations_transformer(text):
    """Extract relations using REBEL end-to-end."""
    # REBEL takes raw text -- no entity marking needed.
    # Ask for token ids and decode them ourselves: the pipeline's default
    # decoding strips the <triplet>/<subj>/<obj> markers we need to parse.
    result = re_pipeline(
        text,
        max_length=256,
        num_beams=3,
        truncation=True,
        return_tensors=True,
        return_text=False
    )
    generated_text = re_pipeline.tokenizer.batch_decode(
        [result[0]["generated_token_ids"]]
    )[0]
    return parse_rebel_output(generated_text)

# Alternative: Use a QA-based RE approach with DeBERTa
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

tokenizer = AutoTokenizer.from_pretrained("deepset/deberta-v3-large-squad2")
qa_model = AutoModelForQuestionAnswering.from_pretrained("deepset/deberta-v3-large-squad2")

def extract_relations_qa(text, entity, relation_type="works at"):
    """Query-based extraction: 'Where does X work?' → extract answer"""
    question = f"Where does {entity} work?"

    inputs = tokenizer(question, text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = qa_model(**inputs)

    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])

    confidence = (
        torch.max(torch.softmax(outputs.start_logits, dim=1)) *
        torch.max(torch.softmax(outputs.end_logits, dim=1))
    ).item()

    return {"subject": entity, "relation": relation_type, "object": answer.strip(), "confidence": confidence}

# Test with real text
text = """
Sam Altman is the CEO of OpenAI, an artificial intelligence research company based in San Francisco.
The organization was founded by Elon Musk, Sam Altman, and others in 2015.
"""

relations = extract_relations_transformer(text)
print(relations)

The REBEL model is my go-to for English text. It’s trained on Wikipedia and handles complex sentences better than rule-based systems. Expect 70-85% F1 on domain-specific text after fine-tuning.
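To know where you land in that range on your own data, score predictions against gold annotations at the triplet level. A minimal sketch of micro-averaged exact-match scoring (the lowercase normalization is an assumption; adjust it to your annotation scheme):

```python
def triplet_f1(predicted, gold):
    """Micro-averaged precision/recall/F1 over exact-match triplets."""
    norm = lambda r: (r["subject"].lower().strip(),
                      r["relation"].lower().strip(),
                      r["object"].lower().strip())
    pred_set = {norm(r) for r in predicted}
    gold_set = {norm(r) for r in gold}
    tp = len(pred_set & gold_set)  # triplets found in both sets
    precision = tp / len(pred_set) if pred_set else 0.0
    recall = tp / len(gold_set) if gold_set else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return {"precision": precision, "recall": recall, "f1": f1}
```

Exact match is strict; if your extractor returns "Google Inc." against a gold "Google", consider adding a fuzzier normalization step before comparing.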

Zero-Shot Flexibility: LLM-Based Extraction

When you don’t have labeled data or need custom relation types, use an LLM with structured output. OpenAI’s function calling or Anthropic’s tool use forces the model to return JSON.

from anthropic import Anthropic
import json

client = Anthropic(api_key="your-api-key")

def extract_relations_llm(text, relation_types):
    """Use Claude with structured output for relation extraction"""

    prompt = f"""Extract all relationships from this text. Return JSON array of objects with keys: subject, relation, object, confidence (0-1).

Relation types to look for: {', '.join(relation_types)}

Text:
{text}

JSON output:"""

    response = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=2048,
        messages=[{"role": "user", "content": prompt}],
        temperature=0  # Deterministic output
    )

    # Parse structured response
    try:
        relations = json.loads(response.content[0].text)
        return relations
    except json.JSONDecodeError:
        # Fallback: extract JSON from markdown code blocks
        content = response.content[0].text
        if "```json" in content:
            json_str = content.split("```json")[1].split("```")[0].strip()
            return json.loads(json_str)
        raise

# Define custom relations
relation_types = [
    "works_at",
    "founded_by",
    "located_in",
    "subsidiary_of",
    "acquired_by",
    "invested_in"
]

text = """
Microsoft acquired GitHub in 2018 for $7.5 billion. GitHub is now a subsidiary of Microsoft
and operates from its headquarters in San Francisco. Nat Friedman became CEO after the acquisition.
Satya Nadella, CEO of Microsoft, announced the deal from Redmond.
"""

relations = extract_relations_llm(text, relation_types)
print(json.dumps(relations, indent=2))

Expected output (illustrative; exact triplets and confidence scores vary by model version):

[
  {"subject": "GitHub", "relation": "acquired_by", "object": "Microsoft", "confidence": 0.95},
  {"subject": "GitHub", "relation": "subsidiary_of", "object": "Microsoft", "confidence": 0.95},
  {"subject": "GitHub", "relation": "located_in", "object": "San Francisco", "confidence": 0.9},
  {"subject": "Nat Friedman", "relation": "works_at", "object": "GitHub", "confidence": 0.85},
  {"subject": "Satya Nadella", "relation": "works_at", "object": "Microsoft", "confidence": 0.9},
  {"subject": "Microsoft", "relation": "located_in", "object": "Redmond", "confidence": 0.85}
]

LLMs handle ambiguity better than transformers but cost 100x more per request. Use them when you need custom relations or zero-shot performance on new domains.
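However the JSON comes back, validate it before it enters your pipeline: models occasionally emit extra keys, relation types you never asked for, or confidences outside [0, 1]. A minimal sketch (`validate_relations` is a hypothetical helper, not part of any SDK):

```python
def validate_relations(raw, allowed_relations):
    """Keep only well-formed relation dicts; clamp confidence to [0, 1]."""
    valid = []
    for rel in raw:
        if not isinstance(rel, dict):
            continue  # skip non-dict entries the model may have emitted
        if not all(k in rel for k in ("subject", "relation", "object")):
            continue  # skip incomplete triplets
        if rel["relation"] not in allowed_relations:
            continue  # skip relation types outside the requested schema
        conf = float(rel.get("confidence", 1.0))
        rel["confidence"] = min(max(conf, 0.0), 1.0)
        valid.append(rel)
    return valid
```

Run this on the parsed response before any deduplication or graph building, so schema drift from the model never reaches your storage layer.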

Building a Knowledge Graph from Extracted Relations

Once you have relations, store them in a graph database like Neo4j or use NetworkX for visualization.

import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict

def build_knowledge_graph(relations):
    """Convert relations to NetworkX graph"""
    G = nx.MultiDiGraph()

    for rel in relations:
        G.add_node(rel['subject'], type='entity')
        G.add_node(rel['object'], type='entity')
        G.add_edge(
            rel['subject'],
            rel['object'],
            relation=rel['relation'],
            confidence=rel['confidence']
        )

    return G

# Combine all methods
all_relations = (
    extract_relations_spacy(text) +
    extract_relations_transformer(text) +
    extract_relations_llm(text, relation_types)
)

# Deduplicate by (subject, relation, object) tuple
seen = set()
unique_relations = []
for rel in all_relations:
    key = (rel['subject'].lower(), rel['relation'], rel['object'].lower())
    if key not in seen:
        seen.add(key)
        unique_relations.append(rel)

# Build graph
G = build_knowledge_graph(unique_relations)

# Export to Neo4j Cypher (if you have a Neo4j instance)
def export_to_neo4j_cypher(relations):
    """Generate Cypher statements for Neo4j import"""
    statements = []

    entities = set()
    for rel in relations:
        entities.add(rel['subject'])
        entities.add(rel['object'])

    # Create nodes with Entity label
    for entity in entities:
        safe_name = entity.replace("'", "\\'")
        statements.append(
            f"MERGE (n:Entity {{name: '{safe_name}'}})"
        )

    # Create relationships
    for rel in relations:
        safe_subj = rel['subject'].replace("'", "\\'")
        safe_obj = rel['object'].replace("'", "\\'")
        rel_type = rel['relation'].upper().replace(' ', '_')
        conf = rel.get('confidence', 1.0)

        statements.append(
            f"MATCH (a:Entity {{name: '{safe_subj}'}}), (b:Entity {{name: '{safe_obj}'}}) "
            f"MERGE (a)-[:{rel_type} {{confidence: {conf}}}]->(b)"
        )

    return "\n".join(statements)

cypher = export_to_neo4j_cypher(unique_relations)
print(cypher)

For production, use Neo4j with the official Python driver. For quick exploration, NetworkX plus pyvis for interactive HTML graphs works great.

Choosing Your Approach

Here’s when to use each method:

spaCy dependency parsing:

  • Prototype in under 30 minutes
  • Small dataset (<1000 documents)
  • Simple, predictable patterns
  • No GPU available

Transformer models:

  • Production system with labeled data
  • Need 80%+ accuracy
  • Handling 10K+ documents
  • Have GPU infrastructure

LLM-based extraction:

  • Zero-shot on new domains
  • Custom relation types change frequently
  • Complex, ambiguous text (legal, medical)
  • Budget for API costs ($0.50-5 per 1K documents)

I recommend starting with spaCy to validate your use case, then moving to transformers for production. Save LLMs for the 10% of edge cases where transformers fail.
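That split can also run as a cascade at inference time: accept the transformer's output when it is confident, and escalate only low-confidence documents to the LLM. A sketch with stand-in extractor functions (the 0.7 threshold and the escalate-on-any-low-confidence policy are assumptions to tune):

```python
def cascade_extract(text, transformer_fn, llm_fn, threshold=0.7):
    """Run the cheap extractor first; escalate low-confidence documents."""
    relations = transformer_fn(text)
    # Escalate if nothing was found or any triplet falls below the threshold
    if relations and all(r.get("confidence", 0.0) >= threshold for r in relations):
        return relations
    return llm_fn(text)
```

Wired up with the functions above, `transformer_fn` would be `extract_relations_transformer` and `llm_fn` a closure over `extract_relations_llm` with your relation types.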

Common Errors and Fixes

Error: spaCy extracts duplicate relations with different confidence scores

This happens when multiple dependency paths lead to the same relation. Deduplicate by keeping the highest confidence:

def deduplicate_relations(relations):
    best = {}
    for rel in relations:
        key = (rel['subject'], rel['relation'], rel['object'])
        if key not in best or rel['confidence'] > best[key]['confidence']:
            best[key] = rel
    return list(best.values())

Error: Transformer model OOM on long documents

Truncate to 512 tokens and process in sliding windows:

def chunk_text(text, max_length=400, overlap=100):
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_length - overlap):
        chunks.append(' '.join(words[i:i+max_length]))
    return chunks
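Then run the extractor over each chunk and merge the results; because the windows overlap, the same triplet can surface in two chunks, so deduplicate while merging. A sketch (`extract_fn` stands in for any of the extractors above):

```python
def extract_over_chunks(chunks, extract_fn):
    """Extract per chunk and deduplicate triplets repeated across overlaps."""
    seen, merged = set(), []
    for chunk in chunks:
        for rel in extract_fn(chunk):
            key = (rel["subject"].lower(), rel["relation"], rel["object"].lower())
            if key not in seen:
                seen.add(key)
                merged.append(rel)
    return merged
```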

Error: LLM returns malformed JSON

Force JSON mode with response_format parameter (OpenAI) or parse from markdown:

import json
import re

def extract_json_from_response(text):
    # Try a direct parse first
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Fall back to extracting from a markdown code block
        match = re.search(r'```(?:json)?\s*\n(.*?)\n```', text, re.DOTALL)
        if match:
            return json.loads(match.group(1))
        raise ValueError("No valid JSON found in response")

Error: High false positive rate on negative examples

Add a “no_relation” class and filter by confidence threshold:

filtered = [r for r in relations if r['relation'] != 'no_relation' and r['confidence'] > 0.6]

For transformers, fine-tune with balanced negative examples. For LLMs, add “Only extract relations you are confident about” to the prompt.
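A quick way to get those balanced negatives is to pair up co-occurring entities that have no gold relation and label them no_relation. A sketch (the uniform sampling policy is an assumption; harder negatives, such as pairs sharing a sentence, usually train better):

```python
import itertools
import random

def sample_negatives(entities, gold_pairs, k=2, seed=0):
    """Label entity pairs with no gold relation as no_relation examples."""
    rng = random.Random(seed)
    # Ordered pairs, excluding anything annotated with a real relation
    candidates = [(a, b) for a, b in itertools.permutations(entities, 2)
                  if (a, b) not in gold_pairs]
    rng.shuffle(candidates)
    return [{"subject": a, "relation": "no_relation", "object": b}
            for a, b in candidates[:k]]
```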