path: root/packages/server/src/utils/nextSpeakerChecker.ts
author    Taylor Mullen <[email protected]>  2025-05-10 13:11:03 -0700
committer N. Taylor Mullen <[email protected]>  2025-05-10 14:05:58 -0700
commit    d159a1507e519da9b0bd92c2d2417ac0596f77fd (patch)
tree      3e2bb0d49eb77b7dd6a7e25088bb6108c72e9289 /packages/server/src/utils/nextSpeakerChecker.ts
parent    c0eab31c023e16de55302d06013a2abfb7f7aca9 (diff)
Don't prematurely end convo w/ Gemini.
- There seems to be a root model bug where the model will preemptively bail on conversations without trying harder. Typically the stops are VERY obvious and bug-looking, where you need to prompt the model to "continue".
- This PR attempts to fix the above by running a 2.0-flash request (we don't need something more powerful) at the end of every full interaction to see who should speak next (the user or the model).
- Add tests for nextSpeakerChecker

Fixes https://b.corp.google.com/issues/416826051
Diffstat (limited to 'packages/server/src/utils/nextSpeakerChecker.ts')
-rw-r--r--  packages/server/src/utils/nextSpeakerChecker.ts  97
1 file changed, 97 insertions, 0 deletions
diff --git a/packages/server/src/utils/nextSpeakerChecker.ts b/packages/server/src/utils/nextSpeakerChecker.ts
new file mode 100644
index 00000000..f852879f
--- /dev/null
+++ b/packages/server/src/utils/nextSpeakerChecker.ts
@@ -0,0 +1,97 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { Chat, Content, SchemaUnion, Type } from '@google/genai';
+import { GeminiClient } from '../core/client.js';
+
+const CHECK_PROMPT = `Analyze *only* the content and structure of your immediately preceding response (your last turn in the conversation history). Based *strictly* on that response, determine who should logically speak next: the 'user' or the 'model' (you).
+**Decision Rules (apply in order):**
+1. **Model Continues:** If your last response explicitly states an immediate next action *you* intend to take (e.g., "Next, I will...", "Now I'll process...", "Moving on to analyze..."), or indicates an intended tool call that didn't execute, OR if the response seems clearly incomplete (cut off mid-thought without a natural conclusion), then the **'model'** should speak next.
+2. **Question to User:** If your last response ends with a direct question specifically addressed *to the user*, then the **'user'** should speak next.
+3. **Waiting for User:** If your last response completed a thought, statement, or task *and* does not meet the criteria for Rule 1 (Model Continues) or Rule 2 (Question to User), it implies a pause expecting user input or reaction. In this case, the **'user'** should speak next.
+**Output Format:**
+Respond *only* in JSON format according to the following schema. Do not include any text outside the JSON structure.
+\`\`\`json
+{
+ "type": "object",
+ "properties": {
+ "reasoning": {
+ "type": "string",
+ "description": "Brief explanation justifying the 'next_speaker' choice based *strictly* on the applicable rule and the content/structure of the preceding turn."
+ },
+ "next_speaker": {
+ "type": "string",
+ "enum": ["user", "model"],
+ "description": "Who should speak next based *only* on the preceding turn and the decision rules."
+ }
+ },
+ "required": ["next_speaker", "reasoning"]
+}
+\`\`\`
+`;
+
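+// Structured-output schema handed to generateJson; it mirrors the JSON
+// schema embedded in CHECK_PROMPT above.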
+const RESPONSE_SCHEMA: SchemaUnion = {
+ type: Type.OBJECT,
+ properties: {
+ reasoning: {
+ type: Type.STRING,
+ description:
+ "Brief explanation justifying the 'next_speaker' choice based *strictly* on the applicable rule and the content/structure of the preceding turn.",
+ },
+ next_speaker: {
+ type: Type.STRING,
+ enum: ['user', 'model'],
+ description:
+ 'Who should speak next based *only* on the preceding turn and the decision rules',
+ },
+ },
+ required: ['reasoning', 'next_speaker'],
+};
+
+export interface NextSpeakerResponse {
+ reasoning: string;
+ next_speaker: 'user' | 'model';
+}
+
+export async function checkNextSpeaker(
+ chat: Chat,
+ geminiClient: GeminiClient,
+): Promise<NextSpeakerResponse | null> {
+ const history = await chat.getHistory();
+ // Ensure there's a model response to analyze
+ if (history.length === 0 || history[history.length - 1].role !== 'model') {
+ // Cannot determine next speaker if the last turn wasn't from the model
+ // or if history is empty.
+ return null;
+ }
+
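+  // Replay the full history and append the check prompt as a synthetic user
+  // turn, so the checking model can judge the model's own last response.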
+ const contents: Content[] = [
+ ...history,
+ { role: 'user', parts: [{ text: CHECK_PROMPT }] },
+ ];
+
+ try {
+ const parsedResponse = (await geminiClient.generateJson(
+ contents,
+ RESPONSE_SCHEMA,
+ )) as unknown as NextSpeakerResponse;
+
+ if (
+ parsedResponse &&
+ parsedResponse.next_speaker &&
+ ['user', 'model'].includes(parsedResponse.next_speaker)
+ ) {
+ return parsedResponse;
+ }
+ return null;
+ } catch (error) {
+ console.warn(
+      'Failed to talk to Gemini endpoint when checking if the conversation should continue.',
+ error,
+ );
+ return null;
+ }
+}
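
For context, a minimal sketch of how a caller might consume this API (not part of the commit): it assumes a `Chat` from `@google/genai` and a `GeminiClient` instance are already available, and the `maybeContinueConversation` wrapper and the literal "Please continue." message are illustrative, not APIs introduced by this change.

```ts
import { Chat } from '@google/genai';
import { GeminiClient } from '../core/client.js';
import { checkNextSpeaker } from './nextSpeakerChecker.js';

// Hypothetical wrapper: after a full model turn completes, run the
// next-speaker check and nudge the model if it bailed out prematurely.
async function maybeContinueConversation(
  chat: Chat,
  geminiClient: GeminiClient,
): Promise<void> {
  const result = await checkNextSpeaker(chat, geminiClient);

  // null means the check failed or the last turn wasn't from the model;
  // in either case we fall back to waiting for the user.
  if (result?.next_speaker === 'model') {
    // Mirrors the "continue" prompt users previously had to type by hand.
    await chat.sendMessage({ message: 'Please continue.' });
  }
}
```

Since `checkNextSpeaker` returns `null` on any failure, callers can treat the check as purely advisory: the worst case is the pre-existing behavior, where the conversation pauses and the user speaks next.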