The Quick Version
A chatbot without memory forgets everything after each message. To make conversations feel natural, you need to send prior messages back with each new request. The simplest approach is a message buffer — store all messages and pass them to the LLM every turn.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from openai import OpenAI

client = OpenAI()
class Chatbot:
    """Minimal chatbot: buffers the full message history in memory.

    Every turn replays the entire history to the model, so the bot
    "remembers" everything — at the cost of token usage that grows
    with every message.
    """

    def __init__(self, system_prompt: str = "You are a helpful assistant."):
        # History always starts with the system message; user/assistant
        # turns are appended after it.
        self.messages = [{"role": "system", "content": system_prompt}]

    def chat(self, user_message: str) -> str:
        """Send one user turn and return the assistant's reply.

        Both the user message and the reply are appended to the buffer
        so the next call sees the whole conversation.
        """
        self.messages.append({"role": "user", "content": user_message})
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=self.messages,
        )
        assistant_message = response.choices[0].message.content
        self.messages.append({"role": "assistant", "content": assistant_message})
        return assistant_message
# Demo: each call builds on the turns before it.
bot = Chatbot("You are a Python tutor. Keep answers concise.")
print(bot.chat("What's a list comprehension?"))
print(bot.chat("Give me an example with filtering"))
print(bot.chat("Can I nest them?"))
# The bot remembers the full conversation and builds on previous answers
This works until the conversation gets too long and exceeds the model’s context window. For real applications, you need smarter memory strategies.
Sliding Window Memory
Keep the last N messages and drop older ones. Simple, predictable memory usage.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
class SlidingWindowChat:
    """Chat memory that keeps only the most recent N messages.

    Memory usage is bounded and predictable; anything older than the
    window is simply forgotten.
    """

    def __init__(self, system_prompt: str, max_messages: int = 20):
        # The system prompt lives outside the window so it can never be
        # trimmed away.
        self.system_prompt = {"role": "system", "content": system_prompt}
        self.messages: list[dict] = []
        self.max_messages = max_messages

    def chat(self, user_message: str) -> str:
        """Send one user turn; history beyond the window is dropped."""
        self.messages.append({"role": "user", "content": user_message})
        # Keep only the last N messages
        if len(self.messages) > self.max_messages:
            self.messages = self.messages[-self.max_messages:]
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[self.system_prompt] + self.messages,
        )
        reply = response.choices[0].message.content
        self.messages.append({"role": "assistant", "content": reply})
        return reply
The tradeoff: the bot loses context from early in the conversation. If the user mentioned their name in message #1 and you’re at message #25, it’s gone. For many chat applications, this is fine — recent context matters most.
Summary Memory
Instead of dropping old messages, summarize them. The bot keeps a running summary of the conversation plus the last few messages for detail.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
class SummaryMemoryChat:
    """Keeps a rolling LLM-written summary plus the last few raw messages.

    When the raw buffer exceeds `summary_threshold`, older messages are
    folded into `self.summary` and only the last 4 are kept verbatim.
    """

    def __init__(self, system_prompt: str, summary_threshold: int = 10):
        self.system_prompt = system_prompt
        # Rolling summary of everything older than the recent buffer.
        self.summary = ""
        # Verbatim recent turns (user + assistant).
        self.recent_messages: list[dict] = []
        self.summary_threshold = summary_threshold

    def _summarize(self) -> str:
        """Summarize older messages into a compact summary."""
        messages_to_summarize = self.recent_messages[:-4]  # keep last 4 messages
        if not messages_to_summarize:
            # Nothing old enough to fold in; keep the current summary.
            return self.summary
        conversation_text = "\n".join(
            f"{m['role']}: {m['content']}" for m in messages_to_summarize
        )
        prompt = f"""Summarize this conversation concisely, preserving key facts, names, preferences, and decisions:
Previous summary: {self.summary or 'None'}
New messages:
{conversation_text}
Updated summary:"""
        response = client.chat.completions.create(
            model="gpt-4o-mini",  # use a cheap model for summarization
            messages=[{"role": "user", "content": prompt}],
            max_tokens=300,
        )
        return response.choices[0].message.content

    def chat(self, user_message: str) -> str:
        """Send one user turn, folding older history into the summary."""
        self.recent_messages.append({"role": "user", "content": user_message})
        # Summarize when messages pile up
        if len(self.recent_messages) > self.summary_threshold:
            self.summary = self._summarize()
            self.recent_messages = self.recent_messages[-4:]
        # Build context with summary + recent messages
        system = self.system_prompt
        if self.summary:
            system += f"\n\nConversation summary so far:\n{self.summary}"
        messages = [{"role": "system", "content": system}] + self.recent_messages
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
        )
        reply = response.choices[0].message.content
        self.recent_messages.append({"role": "assistant", "content": reply})
        return reply
# Demo: early turns eventually migrate into the running summary.
bot = SummaryMemoryChat("You are a project planning assistant.")
bot.chat("I'm building a REST API for a bookstore")
bot.chat("We need endpoints for books, authors, and reviews")
# ... many turns later, early details live in the summary
Summary memory costs one extra LLM call every N messages but preserves key information indefinitely. Use gpt-4o-mini for the summary call to keep costs low.
Persistent Memory with SQLite
For chatbots that need to remember across sessions (user comes back tomorrow), store messages in a database.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import sqlite3
import json
from datetime import datetime


class PersistentChat:
    """Chat memory backed by SQLite so history survives process restarts."""

    def __init__(self, db_path: str = "chat_memory.db"):
        self.conn = sqlite3.connect(db_path)
        # Raw per-session message log.
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS messages (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                session_id TEXT NOT NULL,
                role TEXT NOT NULL,
                content TEXT NOT NULL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        # One rolling summary row per session.
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS session_summaries (
                session_id TEXT PRIMARY KEY,
                summary TEXT NOT NULL,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        self.conn.commit()

    def get_history(self, session_id: str, limit: int = 20) -> list[dict]:
        """Load recent messages for a session, oldest first."""
        rows = self.conn.execute(
            "SELECT role, content FROM messages WHERE session_id = ? ORDER BY id DESC LIMIT ?",
            (session_id, limit),
        ).fetchall()
        # Query newest-first so LIMIT keeps the most recent rows, then
        # reverse back into chronological chat order.
        return [{"role": r[0], "content": r[1]} for r in reversed(rows)]

    def save_message(self, session_id: str, role: str, content: str):
        """Append one message to the session's log."""
        self.conn.execute(
            "INSERT INTO messages (session_id, role, content) VALUES (?, ?, ?)",
            (session_id, role, content),
        )
        self.conn.commit()

    def get_summary(self, session_id: str) -> str:
        """Return the stored summary for the session, or "" if none."""
        row = self.conn.execute(
            "SELECT summary FROM session_summaries WHERE session_id = ?",
            (session_id,),
        ).fetchone()
        return row[0] if row else ""

    def save_summary(self, session_id: str, summary: str):
        """Upsert the session's summary (one row per session)."""
        self.conn.execute(
            """INSERT INTO session_summaries (session_id, summary, updated_at)
               VALUES (?, ?, ?)
               ON CONFLICT(session_id) DO UPDATE SET summary=?, updated_at=?""",
            (session_id, summary, datetime.now(), summary, datetime.now()),
        )
        self.conn.commit()

    def chat(self, session_id: str, user_message: str) -> str:
        """Answer one user turn using the stored history + summary as context."""
        self.save_message(session_id, "user", user_message)
        history = self.get_history(session_id, limit=20)
        summary = self.get_summary(session_id)
        system = "You are a helpful assistant."
        if summary:
            system += f"\n\nPrevious conversation context:\n{summary}"
        messages = [{"role": "system", "content": system}] + history
        response = client.chat.completions.create(
            model="gpt-4o", messages=messages,
        )
        reply = response.choices[0].message.content
        self.save_message(session_id, "assistant", reply)
        return reply
# Usage — memory persists across program restarts
chat = PersistentChat()
print(chat.chat("user_123", "My name is Alex and I'm working on a Django project"))
# ... restart program ...
print(chat.chat("user_123", "What was I working on?"))
# "You mentioned you're working on a Django project, Alex."
Entity Memory: Remembering Facts About Users
Extract and store specific facts from conversations for precise recall:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
class EntityMemory:
    """Stores structured facts about each user, extracted by the LLM."""

    def __init__(self):
        self.entities: dict[str, dict] = {}  # user_id -> {entity: value}

    def extract_entities(self, user_id: str, message: str):
        """Use the LLM to extract facts from the message."""
        current = json.dumps(self.entities.get(user_id, {}), indent=2)
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{
                "role": "user",
                "content": f"""Extract key facts from this message. Return JSON with entity names as keys.
Only include clear, stated facts. Keep existing facts unless contradicted.
Current facts: {current}
Message: {message}
Return updated JSON:""",
            }],
            response_format={"type": "json_object"},
            max_tokens=200,
        )
        try:
            new_entities = json.loads(response.choices[0].message.content)
            if user_id not in self.entities:
                self.entities[user_id] = {}
            # Merge so previously stored facts survive unless the model
            # returned a new value for the same key.
            self.entities[user_id].update(new_entities)
        except json.JSONDecodeError:
            # Model returned malformed JSON; keep existing facts untouched.
            pass

    def get_context(self, user_id: str) -> str:
        """Render known facts as a bullet list for the system prompt."""
        facts = self.entities.get(user_id, {})
        if not facts:
            return ""
        return "Known facts about the user:\n" + "\n".join(
            f"- {k}: {v}" for k, v in facts.items()
        )
# Demo: extract facts from one message, then render them as context.
memory = EntityMemory()
memory.extract_entities("u1", "I'm Alex, I use Python and work at Acme Corp")
print(memory.get_context("u1"))
# Known facts about the user:
# - name: Alex
# - programming_language: Python
# - company: Acme Corp
Inject this context into the system prompt so the bot always remembers key facts without replaying the full conversation history.
Common Errors and Fixes
Bot contradicts itself in long conversations
The sliding window dropped the message where it made a commitment. Switch to summary memory and explicitly instruct the summary to preserve “decisions, commitments, and promises made.”
Context window exceeded error
Count tokens before sending. Use tiktoken to measure total message size and trim if needed. Always reserve tokens for the response (at least 500-1000).
Summary loses important details
Make the summary prompt more specific: “Preserve all names, numbers, dates, code snippets, and technical decisions.” Also increase the summary max_tokens from 300 to 500.
Persistent storage gets too large
Add a TTL (time-to-live) to old sessions. Delete messages older than 30 days, or archive them to cold storage. For active users, periodically summarize and purge raw messages.
Entity extraction hallucinates facts
The LLM sometimes infers facts that weren’t stated. Add “Only extract facts explicitly stated in the message. Do not infer or assume.” to the extraction prompt. Also validate extracted entities against the original message.