Skip to content

Commit bd0581c

Browse files
authored
Merge pull request #174 from mongodb-developer/voice-memory-wip
Voice memory wip
2 parents 1f2d3dc + c518281 commit bd0581c

28 files changed

+9787
-0
lines changed
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Google Gemini API Key
2+
# Get yours at: https://aistudio.google.com/apikey
3+
GOOGLE_API_KEY=your_gemini_api_key_here
4+
5+
# MongoDB Connection String
6+
# Get yours at: https://cloud.mongodb.com
7+
MONGODB_URI=mongodb+srv://username:password@cluster.mongodb.net/?retryWrites=true&w=majority
8+
9+
# MongoDB Database Name (optional, defaults to 'voice_memory_demo')
10+
MONGODB_DB=voice_memory_demo
11+
12+
# VoyageAI API Key (for semantic embeddings)
13+
# Get yours at: https://www.voyageai.com/
14+
VOYAGE_AI_API_KEY=your_voyage_ai_api_key_here

apps/voice-memory-demo/.gitignore

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
2+
3+
# dependencies
4+
/node_modules
5+
/.pnp
6+
.pnp.*
7+
.yarn/*
8+
!.yarn/patches
9+
!.yarn/plugins
10+
!.yarn/releases
11+
!.yarn/versions
12+
13+
# testing
14+
/coverage
15+
16+
# next.js
17+
/.next/
18+
/out/
19+
20+
# production
21+
/build
22+
23+
# misc
24+
.DS_Store
25+
*.pem
26+
27+
# debug
28+
npm-debug.log*
29+
yarn-debug.log*
30+
yarn-error.log*
31+
.pnpm-debug.log*
32+
33+
# env files (can opt-in for committing if needed)
34+
.env*
35+
36+
# vercel
37+
.vercel
38+
39+
# typescript
40+
*.tsbuildinfo
41+
next-env.d.ts

apps/voice-memory-demo/README.md

Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
# Voice Memory Demo
2+
3+
A Next.js demo showcasing persistent memory for voice AI agents using **Gemini Live** and **MongoDB**.
4+
5+
Based on the article: [Building Persistent Memory for Voice AI Agents with MongoDB](../article.md)
6+
7+
## Features
8+
9+
- 🎙️ **Real-time Voice Interaction** - WebSocket-based voice communication with Gemini Live
10+
- 🧠 **Persistent Memory** - Store and retrieve memories across sessions using MongoDB
11+
- 🔧 **Memory as Tool** - AI decides when to store/retrieve information (not hardcoded rules)
12+
- 🔒 **User Isolation** - Each browser gets a unique ID for privacy
13+
- 🌐 **Global vs Private** - Gemini classifies memories and obfuscates PII in shared data
14+
15+
## Architecture
16+
17+
```
18+
┌─────────────────────┐ ┌──────────────────────┐
19+
│ Browser │ │ Gemini Live API │
20+
│ - Mic capture │◄───►│ WebSocket │
21+
│ - Audio playback │ │ gemini-live-2.5- │
22+
│ - UI │ │ flash-native-audio │
23+
└─────────────────────┘ └──────────────────────┘
24+
25+
│ Tool Calls
26+
27+
┌─────────────────────┐ ┌──────────────────────┐
28+
│ Next.js API │────►│ MongoDB Atlas │
29+
│ /api/memory │ │ memories collection│
30+
└─────────────────────┘ └──────────────────────┘
31+
```
32+
33+
## Quick Start
34+
35+
### 1. Clone and Install
36+
37+
```bash
38+
cd apps/voice-memory-demo
39+
npm install
40+
```
41+
42+
### 2. Set Up Environment Variables
43+
44+
```bash
45+
cp .env.local.example .env.local
46+
```
47+
48+
Edit `.env.local` with your credentials:
49+
50+
```env
51+
GOOGLE_API_KEY=your_gemini_api_key
52+
MONGODB_URI=mongodb+srv://...
53+
MONGODB_DB=voice_memory_demo
54+
VOYAGE_AI_API_KEY=your_voyage_ai_key # Optional, for semantic search
55+
```
56+
57+
### 3. Run the Development Server
58+
59+
```bash
60+
npm run dev
61+
```
62+
63+
Open [http://localhost:3000](http://localhost:3000) in your browser.
64+
65+
### 4. Create MongoDB Indexes
66+
67+
For hybrid search (vector + text) with `$rankFusion`, you need to create two indexes on the `memories` collection — an Atlas Vector Search index and an Atlas Search (text) index:
68+
69+
#### Vector Search Index
70+
71+
**Index Name:** `memory_vector_index`
72+
73+
In Atlas UI: **Search Indexes** → **Create Search Index** → **Atlas Vector Search**
74+
75+
```json
76+
{
77+
"fields": [
78+
{
79+
"type": "vector",
80+
"path": "embedding",
81+
"numDimensions": 1024,
82+
"similarity": "cosine"
83+
},
84+
{
85+
"type": "filter",
86+
"path": "deploymentId"
87+
},
88+
{
89+
"type": "filter",
90+
"path": "userCookie"
91+
},
92+
{
93+
"type": "filter",
94+
"path": "isGlobal"
95+
}
96+
]
97+
}
98+
```
99+
100+
#### Atlas Search Index (Text)
101+
102+
**Index Name:** `memory_text_index`
103+
104+
In Atlas UI: **Search Indexes** → **Create Search Index** → **Atlas Search**
105+
106+
```json
107+
{
108+
"mappings": {
109+
"dynamic": false,
110+
"fields": {
111+
"key": {
112+
"type": "string",
113+
"analyzer": "lucene.standard"
114+
},
115+
"value": {
116+
"type": "string",
117+
"analyzer": "lucene.standard"
118+
},
119+
"deploymentId": {
120+
"type": "string"
121+
},
122+
"userCookie": {
123+
"type": "string"
124+
},
125+
"isGlobal": {
126+
"type": "boolean"
127+
}
128+
}
129+
}
130+
}
131+
```
132+
133+
> **Note:** If you don't set up these indexes, the app will fall back to regex-based search, which is less accurate for semantic queries.
134+
135+
## How It Works
136+
137+
### Memory Tool
138+
139+
The AI agent has access to an `agentMemory` tool with four operations:
140+
141+
| Operation | Description | Example |
142+
|-----------|-------------|---------|
143+
| `set` | Store a key-value pair | `{op: "set", key: "user_name", value: "Pavel"}` |
144+
| `get` | Retrieve by key | `{op: "get", key: "user_name"}` |
145+
| `delete` | Remove a memory | `{op: "delete", key: "user_name"}` |
146+
| `query` | Search memories | `{op: "query", query: "user preferences"}` |
147+
148+
### Memory Classification
149+
150+
When storing a memory, Gemini classifies it:
151+
- **Private**: User-specific data (name, preferences, contact info)
152+
- **Global**: Shared facts (business hours, product info, policies)
153+
154+
Global memories have PII obfuscated (emails → `[EMAIL]`, phones → `[PHONE]`).
155+
156+
### User Isolation
157+
158+
Each browser generates a UUID stored in localStorage:
159+
- Private memories are scoped to this ID
160+
- Global memories are accessible to everyone
161+
- Users can reset their identity from the UI
162+
163+
## File Structure
164+
165+
```
166+
voice-memory-demo/
167+
├── src/
168+
│ ├── app/
169+
│ │ ├── api/memory/route.ts # Memory API endpoint
170+
│ │ ├── page.tsx # Main page
171+
│ │ ├── layout.tsx # App layout
172+
│ │ └── globals.css # Styles
173+
│ ├── components/
174+
│ │ ├── VoiceAgent.tsx # Main voice interface
175+
│ │ └── MemoryPanel.tsx # Memory debug panel
176+
│ ├── hooks/
177+
│ │ ├── useGeminiLive.ts # Gemini WebSocket hook
178+
│ │ └── useUserCookie.ts # User ID management
179+
│ └── lib/
180+
│ ├── mongodb.ts # MongoDB connection
181+
│ ├── memory-service.ts # Memory CRUD operations
182+
│ └── gemini-tools.ts # Tool definitions
183+
├── public/
184+
│ └── audio-processor.js # AudioWorklet for mic capture
185+
└── .env.local.example # Environment template
186+
```
187+
188+
## Try These Prompts
189+
190+
After connecting, try saying:
191+
192+
- "My name is [your name]"
193+
- "I live in [city]"
194+
- "I prefer email over phone calls"
195+
- "What do you remember about me?"
196+
- "What's my name?"
197+
- "Forget my name"
198+
199+
## Technical Details
200+
201+
### Audio Format
202+
203+
- **Input**: PCM 16-bit, 16kHz, mono
204+
- **Output**: PCM 16-bit, 24kHz, mono
205+
206+
### Model
207+
208+
Using `gemini-2.5-flash-native-audio-preview-12-2025` for real-time voice-to-voice interaction with tool calling support.
209+
210+
### MongoDB Schema
211+
212+
```javascript
213+
{
214+
deploymentId: "voice-memory-demo",
215+
key: "user_name",
216+
value: "Pavel",
217+
userCookie: "uuid-xxx" | "global",
218+
isGlobal: false,
219+
embedding: [0.123, -0.456, ...], // 1024-dim VoyageAI vector (if enabled)
220+
createdAt: ISODate(),
221+
updatedAt: ISODate()
222+
}
223+
```
224+
225+
## Troubleshooting
226+
227+
### "WebSocket connection error"
228+
- Check your `GOOGLE_API_KEY` is valid
229+
- Ensure you have access to the Gemini Live API
230+
231+
### No audio playback
232+
- Check browser permissions for audio
233+
- Try clicking the page first (browsers require user interaction)
234+
235+
### Memory not saving
236+
- Verify `MONGODB_URI` is correct
237+
- Check MongoDB Atlas network access (make sure your IP address is on the IP access list)
238+
239+
## License
240+
241+
MIT
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import { defineConfig, globalIgnores } from "eslint/config";
2+
import nextVitals from "eslint-config-next/core-web-vitals";
3+
import nextTs from "eslint-config-next/typescript";
4+
5+
const eslintConfig = defineConfig([
6+
...nextVitals,
7+
...nextTs,
8+
// Override default ignores of eslint-config-next.
9+
globalIgnores([
10+
// Default ignores of eslint-config-next:
11+
".next/**",
12+
"out/**",
13+
"build/**",
14+
"next-env.d.ts",
15+
]),
16+
]);
17+
18+
export default eslintConfig;
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import type { NextConfig } from "next";
2+
3+
const nextConfig: NextConfig = {
4+
/* config options here */
5+
};
6+
7+
export default nextConfig;

0 commit comments

Comments
 (0)