From 21fba832d1b4ea7af43fb887d9b2b38fcf8210d0 Mon Sep 17 00:00:00 2001
From: Tommaso Sciortino
Date: Fri, 30 May 2025 18:25:47 -0700
Subject: Rename server->core (#638)

---
 packages/core/src/utils/nextSpeakerChecker.ts | 151 ++++++++++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100644 packages/core/src/utils/nextSpeakerChecker.ts

(limited to 'packages/core/src/utils/nextSpeakerChecker.ts')

diff --git a/packages/core/src/utils/nextSpeakerChecker.ts b/packages/core/src/utils/nextSpeakerChecker.ts
new file mode 100644
index 00000000..66fa4395
--- /dev/null
+++ b/packages/core/src/utils/nextSpeakerChecker.ts
@@ -0,0 +1,151 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { Content, SchemaUnion, Type } from '@google/genai';
+import { GeminiClient } from '../core/client.js';
+import { GeminiChat } from '../core/geminiChat.js';
+import { isFunctionResponse } from './messageInspectors.js';
+
+const CHECK_PROMPT = `Analyze *only* the content and structure of your immediately preceding response (your last turn in the conversation history). Based *strictly* on that response, determine who should logically speak next: the 'user' or the 'model' (you).
+**Decision Rules (apply in order):**
+1. **Model Continues:** If your last response explicitly states an immediate next action *you* intend to take (e.g., "Next, I will...", "Now I'll process...", "Moving on to analyze...", indicates an intended tool call that didn't execute), OR if the response seems clearly incomplete (cut off mid-thought without a natural conclusion), then the **'model'** should speak next.
+2. **Question to User:** If your last response ends with a direct question specifically addressed *to the user*, then the **'user'** should speak next.
+3. **Waiting for User:** If your last response completed a thought, statement, or task *and* does not meet the criteria for Rule 1 (Model Continues) or Rule 2 (Question to User), it implies a pause expecting user input or reaction. In this case, the **'user'** should speak next.
+**Output Format:**
+Respond *only* in JSON format according to the following schema. Do not include any text outside the JSON structure.
+\`\`\`json
+{
+  "type": "object",
+  "properties": {
+    "reasoning": {
+      "type": "string",
+      "description": "Brief explanation justifying the 'next_speaker' choice based *strictly* on the applicable rule and the content/structure of the preceding turn."
+    },
+    "next_speaker": {
+      "type": "string",
+      "enum": ["user", "model"],
+      "description": "Who should speak next based *only* on the preceding turn and the decision rules."
+    }
+  },
+  "required": ["next_speaker", "reasoning"]
+}
+\`\`\`
+`;
+
+const RESPONSE_SCHEMA: SchemaUnion = {
+  type: Type.OBJECT,
+  properties: {
+    reasoning: {
+      type: Type.STRING,
+      description:
+        "Brief explanation justifying the 'next_speaker' choice based *strictly* on the applicable rule and the content/structure of the preceding turn.",
+    },
+    next_speaker: {
+      type: Type.STRING,
+      enum: ['user', 'model'],
+      description:
+        'Who should speak next based *only* on the preceding turn and the decision rules',
+    },
+  },
+  required: ['reasoning', 'next_speaker'],
+};
+
+export interface NextSpeakerResponse {
+  reasoning: string;
+  next_speaker: 'user' | 'model';
+}
+
+export async function checkNextSpeaker(
+  chat: GeminiChat,
+  geminiClient: GeminiClient,
+  abortSignal: AbortSignal,
+): Promise<NextSpeakerResponse | null> {
+  // We need to capture the curated history because there are many moments when the model will return invalid turns
+  // that, when passed back up to the endpoint, will break subsequent calls. An example of this is when the model
+  // decides to respond with an empty part collection; if you were to send that message back to the server, it
+  // would respond with a 400 indicating that model part collections MUST have content.
+  const curatedHistory = chat.getHistory(/* curated */ true);
+
+  // Ensure there's a model response to analyze
+  if (curatedHistory.length === 0) {
+    // Cannot determine next speaker if history is empty.
+    return null;
+  }
+
+  const comprehensiveHistory = chat.getHistory();
+  // If comprehensiveHistory is empty, there is no last message to check.
+  // This case should ideally be caught by the curatedHistory.length check earlier,
+  // but as a safeguard:
+  if (comprehensiveHistory.length === 0) {
+    return null;
+  }
+  const lastComprehensiveMessage =
+    comprehensiveHistory[comprehensiveHistory.length - 1];
+
+  // If the last message is a user message containing only function_responses,
+  // then the model should speak next.
+  if (
+    lastComprehensiveMessage &&
+    isFunctionResponse(lastComprehensiveMessage)
+  ) {
+    return {
+      reasoning:
+        'The last message was a function response, so the model should speak next.',
+      next_speaker: 'model',
+    };
+  }
+
+  if (
+    lastComprehensiveMessage &&
+    lastComprehensiveMessage.role === 'model' &&
+    lastComprehensiveMessage.parts &&
+    lastComprehensiveMessage.parts.length === 0
+  ) {
+    lastComprehensiveMessage.parts.push({ text: '' });
+    return {
+      reasoning:
+        'The last message was a filler model message with no content (nothing for user to act on), model should speak next.',
+      next_speaker: 'model',
+    };
+  }
+
+  // Things checked out. Let's proceed to potentially making an LLM request.
+
+  const lastMessage = curatedHistory[curatedHistory.length - 1];
+  if (!lastMessage || lastMessage.role !== 'model') {
+    // Cannot determine next speaker if the last turn wasn't from the model
+    // or if history is empty.
+    return null;
+  }
+
+  const contents: Content[] = [
+    ...curatedHistory,
+    { role: 'user', parts: [{ text: CHECK_PROMPT }] },
+  ];
+
+  try {
+    const parsedResponse = (await geminiClient.generateJson(
+      contents,
+      RESPONSE_SCHEMA,
+      abortSignal,
+    )) as unknown as NextSpeakerResponse;
+
+    if (
+      parsedResponse &&
+      parsedResponse.next_speaker &&
+      ['user', 'model'].includes(parsedResponse.next_speaker)
+    ) {
+      return parsedResponse;
+    }
+    return null;
+  } catch (error) {
+    console.warn(
+      'Failed to talk to Gemini endpoint when seeing if conversation should continue.',
+      error,
+    );
+    return null;
+  }
+}
--
cgit v1.2.3
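
Note (not part of the patch): a minimal sketch of how a caller might consume `checkNextSpeaker`. Only `checkNextSpeaker`, `NextSpeakerResponse`, `GeminiChat`, and `GeminiClient` come from the file above; the helper name `shouldModelContinue` and its call shape are illustrative assumptions, not the repository's actual call site. The key contract to notice is that `checkNextSpeaker` resolves to `null` whenever it cannot classify the turn (empty history, last turn not from the model, or a failed endpoint call), so a caller should treat `null` as "wait for the user".

```ts
import { GeminiClient } from '../core/client.js';
import { GeminiChat } from '../core/geminiChat.js';
import { checkNextSpeaker } from './nextSpeakerChecker.js';

// Hypothetical helper: returns true only when the checker affirmatively says
// the model should keep talking. A null result (unclassifiable turn or
// endpoint failure) conservatively yields control back to the user.
async function shouldModelContinue(
  chat: GeminiChat,
  client: GeminiClient,
  signal: AbortSignal,
): Promise<boolean> {
  const result = await checkNextSpeaker(chat, client, signal);
  if (result !== null) {
    // result.reasoning explains which decision rule fired; useful for logs.
    console.debug(`next_speaker=${result.next_speaker}: ${result.reasoning}`);
  }
  return result?.next_speaker === 'model';
}
```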