/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { Content, SchemaUnion, Type } from '@google/genai';
import { DEFAULT_GEMINI_FLASH_MODEL } from '../config/models.js';
import { GeminiClient } from '../core/client.js';
import { GeminiChat } from '../core/geminiChat.js';
import { isFunctionResponse } from './messageInspectors.js';
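
// Prompt appended to the curated history so the model can judge, from its own
// previous turn alone, whether the user or the model should speak next.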
const CHECK_PROMPT = `Analyze *only* the content and structure of your immediately preceding response (your last turn in the conversation history). Based *strictly* on that response, determine who should logically speak next: the 'user' or the 'model' (you).

**Decision Rules (apply in order):**

1. **Model Continues:** If your last response explicitly states an immediate next action *you* intend to take (e.g., "Next, I will...", "Now I'll process...", "Moving on to analyze..."), or indicates an intended tool call that didn't execute, OR if the response seems clearly incomplete (cut off mid-thought without a natural conclusion), then the **'model'** should speak next.
2. **Question to User:** If your last response ends with a direct question specifically addressed *to the user*, then the **'user'** should speak next.
3. **Waiting for User:** If your last response completed a thought, statement, or task *and* does not meet the criteria for Rule 1 (Model Continues) or Rule 2 (Question to User), it implies a pause expecting user input or reaction. In this case, the **'user'** should speak next.

**Output Format:**

Respond *only* in JSON format according to the following schema. Do not include any text outside the JSON structure.

\`\`\`json
{
  "type": "object",
  "properties": {
    "reasoning": {
      "type": "string",
      "description": "Brief explanation justifying the 'next_speaker' choice based *strictly* on the applicable rule and the content/structure of the preceding turn."
    },
    "next_speaker": {
      "type": "string",
      "enum": ["user", "model"],
      "description": "Who should speak next based *only* on the preceding turn and the decision rules."
    }
  },
  "required": ["next_speaker", "reasoning"]
}
\`\`\`
`;
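
// Structured-output schema passed to generateJson; it mirrors the JSON schema
// embedded in CHECK_PROMPT so the response can be parsed directly.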
const RESPONSE_SCHEMA: SchemaUnion = {
  type: Type.OBJECT,
  properties: {
    reasoning: {
      type: Type.STRING,
      description:
        "Brief explanation justifying the 'next_speaker' choice based *strictly* on the applicable rule and the content/structure of the preceding turn.",
    },
    next_speaker: {
      type: Type.STRING,
      enum: ['user', 'model'],
      description:
        'Who should speak next based *only* on the preceding turn and the decision rules',
    },
  },
  required: ['reasoning', 'next_speaker'],
};

export interface NextSpeakerResponse {
  reasoning: string;
  next_speaker: 'user' | 'model';
}
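
/**
 * Decides whether the user or the model should take the next turn.
 *
 * The check is cheap-first: function responses and empty model turns are
 * resolved locally, and only otherwise is a small structured-output request
 * sent to the Flash model with CHECK_PROMPT appended to the curated history.
 * Returns null whenever a decision cannot be made (empty history, last turn
 * not from the model, or a failed/invalid LLM response).
 *
 * @example
 * // Sketch only: how a caller might use the result. The surrounding `chat`,
 * // `client`, and `controller` objects are assumed, not defined in this module.
 * // const result = await checkNextSpeaker(chat, client, controller.signal);
 * // if (result?.next_speaker === 'model') {
 * //   // caller sends a follow-up user message so the model can keep going
 * // }
 */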
export async function checkNextSpeaker(
  chat: GeminiChat,
  geminiClient: GeminiClient,
  abortSignal: AbortSignal,
): Promise<NextSpeakerResponse | null> {
  // Use the curated history because the model sometimes returns invalid turns
  // that break subsequent calls if they are passed back to the endpoint. For
  // example, when the model responds with an empty part collection, sending
  // that message back to the server produces a 400 error indicating that model
  // part collections MUST have content.
  const curatedHistory = chat.getHistory(/* curated */ true);

  // Ensure there's a model response to analyze
  if (curatedHistory.length === 0) {
    // Cannot determine next speaker if history is empty.
    return null;
  }

  const comprehensiveHistory = chat.getHistory();
  // If comprehensiveHistory is empty, there is no last message to check.
  // This case should ideally be caught by the curatedHistory.length check earlier,
  // but as a safeguard:
  if (comprehensiveHistory.length === 0) {
    return null;
  }
  const lastComprehensiveMessage =
    comprehensiveHistory[comprehensiveHistory.length - 1];

  // If the last message is a user message containing only function_responses,
  // then the model should speak next.
  if (
    lastComprehensiveMessage &&
    isFunctionResponse(lastComprehensiveMessage)
  ) {
    return {
      reasoning:
        'The last message was a function response, so the model should speak next.',
      next_speaker: 'model',
    };
  }
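
  // If the last model turn came back with an empty parts array, patch it with
  // an empty text part so the history stays valid, and let the model continue.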
  if (
    lastComprehensiveMessage &&
    lastComprehensiveMessage.role === 'model' &&
    lastComprehensiveMessage.parts &&
    lastComprehensiveMessage.parts.length === 0
  ) {
    lastComprehensiveMessage.parts.push({ text: '' });
    return {
      reasoning:
        'The last message was a filler model message with no content (nothing for user to act on), model should speak next.',
      next_speaker: 'model',
    };
  }

  // Things checked out. Let's proceed to potentially making an LLM request.
  const lastMessage = curatedHistory[curatedHistory.length - 1];
  if (!lastMessage || lastMessage.role !== 'model') {
    // Cannot determine next speaker if the last turn wasn't from the model
    // or if history is empty.
    return null;
  }
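
  // Otherwise, defer the decision to the Flash model: append CHECK_PROMPT to
  // the curated history and ask for a structured verdict.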
  const contents: Content[] = [
    ...curatedHistory,
    { role: 'user', parts: [{ text: CHECK_PROMPT }] },
  ];

  try {
    const parsedResponse = (await geminiClient.generateJson(
      contents,
      RESPONSE_SCHEMA,
      abortSignal,
      DEFAULT_GEMINI_FLASH_MODEL,
    )) as unknown as NextSpeakerResponse;

    if (
      parsedResponse &&
      parsedResponse.next_speaker &&
      ['user', 'model'].includes(parsedResponse.next_speaker)
    ) {
      return parsedResponse;
    }
    return null;
  } catch (error) {
    console.warn(
      'Failed to talk to Gemini endpoint when seeing if conversation should continue.',
      error,
    );
    return null;
  }
}