14 files changed, 292 insertions, 86 deletions
diff --git a/packages/cli/src/ui/App.tsx b/packages/cli/src/ui/App.tsx
index 833cc2b5..e3a5eb55 100644
--- a/packages/cli/src/ui/App.tsx
+++ b/packages/cli/src/ui/App.tsx
@@ -70,6 +70,7 @@ import { UpdateNotification } from './components/UpdateNotification.js';
 import {
   isProQuotaExceededError,
   isGenericQuotaExceededError,
+  UserTierId,
 } from '@google/gemini-cli-core';
 import { checkForUpdates } from './utils/updateCheck.js';
 import ansiEscapes from 'ansi-escapes';
@@ -136,6 +137,8 @@ const App = ({ config, settings, startupWarnings = [] }: AppProps) => {
   const ctrlDTimerRef = useRef<NodeJS.Timeout | null>(null);
   const [constrainHeight, setConstrainHeight] = useState<boolean>(true);
   const [showPrivacyNotice, setShowPrivacyNotice] = useState<boolean>(false);
+  const [modelSwitchedFromQuotaError, setModelSwitchedFromQuotaError] =
+    useState<boolean>(false);
 
   const openPrivacyNotice = useCallback(() => {
     setShowPrivacyNotice(true);
@@ -251,23 +254,51 @@ const App = ({ config, settings, startupWarnings = [] }: AppProps) => {
     ): Promise<boolean> => {
       let message: string;
 
+      // For quota errors, assume FREE tier (safe default) - only show upgrade messaging to free tier users
+      // TODO: Get actual user tier from config when available
+      const userTier = undefined; // Defaults to FREE tier behavior
+      const isPaidTier =
+        userTier === UserTierId.LEGACY || userTier === UserTierId.STANDARD;
+
       // Check if this is a Pro quota exceeded error
       if (error && isProQuotaExceededError(error)) {
-        message = `⚡ You have reached your daily ${currentModel} quota limit.
+        if (isPaidTier) {
+          message = `⚡ You have reached your daily ${currentModel} quota limit.
+⚡ Automatically switching from ${currentModel} to ${fallbackModel} for the remainder of this session.
+⚡ To continue accessing the ${currentModel} model today, consider using /auth to switch to using a paid API key from AI Studio at https://aistudio.google.com/apikey`;
+        } else {
+          message = `⚡ You have reached your daily ${currentModel} quota limit.
 ⚡ Automatically switching from ${currentModel} to ${fallbackModel} for the remainder of this session.
 ⚡ To increase your limits, upgrade to a Gemini Code Assist Standard or Enterprise plan with higher limits at https://goo.gle/set-up-gemini-code-assist
 ⚡ Or you can utilize a Gemini API Key. See: https://goo.gle/gemini-cli-docs-auth#gemini-api-key
 ⚡ You can switch authentication methods by typing /auth`;
+        }
       } else if (error && isGenericQuotaExceededError(error)) {
-        message = `⚡ You have reached your daily quota limit.
+        if (isPaidTier) {
+          message = `⚡ You have reached your daily quota limit.
+⚡ Automatically switching from ${currentModel} to ${fallbackModel} for the remainder of this session.
+⚡ To continue accessing the ${currentModel} model today, consider using /auth to switch to using a paid API key from AI Studio at https://aistudio.google.com/apikey`;
+        } else {
+          message = `⚡ You have reached your daily quota limit.
 ⚡ Automatically switching from ${currentModel} to ${fallbackModel} for the remainder of this session.
 ⚡ To increase your limits, upgrade to a Gemini Code Assist Standard or Enterprise plan with higher limits at https://goo.gle/set-up-gemini-code-assist
 ⚡ Or you can utilize a Gemini API Key. See: https://goo.gle/gemini-cli-docs-auth#gemini-api-key
 ⚡ You can switch authentication methods by typing /auth`;
+        }
       } else {
-        // Default fallback message for other cases (like consecutive 429s)
-        message = `⚡ Slow response times detected.
-⚡ Automatically switching from ${currentModel} to ${fallbackModel} for faster responses for the remainder of this session.`;
+        if (isPaidTier) {
+          // Default fallback message for other cases (like consecutive 429s)
+          message = `⚡ Automatically switching from ${currentModel} to ${fallbackModel} for faster responses for the remainder of this session.
+⚡ Possible reasons for this are that you have received multiple consecutive capacity errors or you have reached your daily ${currentModel} quota limit
+⚡ To continue accessing the ${currentModel} model today, consider using /auth to switch to using a paid API key from AI Studio at https://aistudio.google.com/apikey`;
+        } else {
+          // Default fallback message for other cases (like consecutive 429s)
+          message = `⚡ Automatically switching from ${currentModel} to ${fallbackModel} for faster responses for the remainder of this session.  
+⚡ Possible reasons for this are that you have received multiple consecutive capacity errors or you have reached your daily ${currentModel} quota limit
+⚡ To increase your limits, upgrade to a Gemini Code Assist Standard or Enterprise plan with higher limits at https://goo.gle/set-up-gemini-code-assist
+⚡ Or you can utilize a Gemini API Key. See: https://goo.gle/gemini-cli-docs-auth#gemini-api-key
+⚡ You can switch authentication methods by typing /auth`;
+        }
       }
 
       // Add message to UI history
@@ -278,7 +309,14 @@ const App = ({ config, settings, startupWarnings = [] }: AppProps) => {
         },
         Date.now(),
       );
-      return true; // Always accept the fallback
+
+      // Set the flag to prevent tool continuation
+      setModelSwitchedFromQuotaError(true);
+      // Set global quota error flag to prevent Flash model calls
+      config.setQuotaErrorOccurred(true);
+      // Switch model for future use but return false to stop current retry
+      config.setModel(fallbackModel);
+      return false; // Don't continue with current prompt
     };
 
     config.setFlashFallbackHandler(flashFallbackHandler);
@@ -445,6 +483,8 @@ const App = ({ config, settings, startupWarnings = [] }: AppProps) => {
     getPreferredEditor,
     onAuthError,
     performMemoryRefresh,
+    modelSwitchedFromQuotaError,
+    setModelSwitchedFromQuotaError,
   );
   pendingHistoryItems.push(...pendingGeminiHistoryItems);
   const { elapsedTime, currentLoadingPhrase } =
diff --git a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
index fc6f93c5..62ade50f 100644
--- a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
+++ b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
@@ -301,6 +301,8 @@ describe('useGeminiStream', () => {
       getUsageStatisticsEnabled: () => true,
       getDebugMode: () => false,
       addHistory: vi.fn(),
+      setQuotaErrorOccurred: vi.fn(),
+      getQuotaErrorOccurred: vi.fn(() => false),
     } as unknown as Config;
     mockOnDebugMessage = vi.fn();
     mockHandleSlashCommand = vi.fn().mockResolvedValue(false);
@@ -386,6 +388,8 @@ describe('useGeminiStream', () => {
           () => 'vscode' as EditorType,
           () => {},
           () => Promise.resolve(),
+          false,
+          () => {},
         );
       },
       {
@@ -518,6 +522,8 @@ describe('useGeminiStream', () => {
         () => 'vscode' as EditorType,
         () => {},
         () => Promise.resolve(),
+        false,
+        () => {},
       ),
     );
 
@@ -582,6 +588,8 @@ describe('useGeminiStream', () => {
         () => 'vscode' as EditorType,
         () => {},
         () => Promise.resolve(),
+        false,
+        () => {},
       ),
     );
 
@@ -675,6 +683,8 @@ describe('useGeminiStream', () => {
         () => 'vscode' as EditorType,
         () => {},
         () => Promise.resolve(),
+        false,
+        () => {},
       ),
     );
 
@@ -775,6 +785,8 @@ describe('useGeminiStream', () => {
         () => 'vscode' as EditorType,
         () => {},
         () => Promise.resolve(),
+        false,
+        () => {},
       ),
     );
 
@@ -1063,6 +1075,8 @@ describe('useGeminiStream', () => {
           () => 'vscode' as EditorType,
           () => {},
           mockPerformMemoryRefresh,
+          false,
+          () => {},
         ),
       );
 
@@ -1113,6 +1127,8 @@ describe('useGeminiStream', () => {
           () => 'vscode' as EditorType,
           () => {},
           () => Promise.resolve(),
+          false,
+          () => {},
         ),
       );
 
diff --git a/packages/cli/src/ui/hooks/useGeminiStream.ts b/packages/cli/src/ui/hooks/useGeminiStream.ts
index 550cab86..d32c9ffa 100644
--- a/packages/cli/src/ui/hooks/useGeminiStream.ts
+++ b/packages/cli/src/ui/hooks/useGeminiStream.ts
@@ -90,6 +90,8 @@ export const useGeminiStream = (
   getPreferredEditor: () => EditorType | undefined,
   onAuthError: () => void,
   performMemoryRefresh: () => Promise<void>,
+  modelSwitchedFromQuotaError: boolean,
+  setModelSwitchedFromQuotaError: React.Dispatch<React.SetStateAction<boolean>>,
 ) => {
   const [initError, setInitError] = useState<string | null>(null);
   const abortControllerRef = useRef<AbortController | null>(null);
@@ -494,6 +496,12 @@ export const useGeminiStream = (
       const userMessageTimestamp = Date.now();
       setShowHelp(false);
 
+      // Reset quota error flag when starting a new query (not a continuation)
+      if (!options?.isContinuation) {
+        setModelSwitchedFromQuotaError(false);
+        config.setQuotaErrorOccurred(false);
+      }
+
       abortControllerRef.current = new AbortController();
       const abortSignal = abortControllerRef.current.signal;
       turnCancelledRef.current = false;
@@ -552,6 +560,7 @@ export const useGeminiStream = (
     [
       streamingState,
       setShowHelp,
+      setModelSwitchedFromQuotaError,
       prepareQueryForGemini,
       processGeminiStreamEvents,
       pendingHistoryItemRef,
@@ -668,6 +677,12 @@ export const useGeminiStream = (
       );
 
       markToolsAsSubmitted(callIdsToMarkAsSubmitted);
+
+      // Don't continue if model was switched due to quota error
+      if (modelSwitchedFromQuotaError) {
+        return;
+      }
+
       submitQuery(mergePartListUnions(responsesToSend), {
         isContinuation: true,
       });
@@ -678,6 +693,7 @@ export const useGeminiStream = (
       markToolsAsSubmitted,
       geminiClient,
       performMemoryRefresh,
+      modelSwitchedFromQuotaError,
     ],
   );
 
diff --git a/packages/cli/src/ui/utils/errorParsing.test.ts b/packages/cli/src/ui/utils/errorParsing.test.ts
index 3d228efb..770dffad 100644
--- a/packages/cli/src/ui/utils/errorParsing.test.ts
+++ b/packages/cli/src/ui/utils/errorParsing.test.ts
@@ -39,7 +39,7 @@ describe('parseAndFormatApiError', () => {
     );
     expect(result).toContain('[API Error: Rate limit exceeded');
     expect(result).toContain(
-      'Slow response times detected. Switching to the gemini-2.5-flash model',
+      'Possible quota limitations in place or slow response times detected. Switching to the gemini-2.5-flash model',
     );
   });
 
@@ -55,7 +55,7 @@ describe('parseAndFormatApiError', () => {
     );
     expect(result).toContain('[API Error: Rate limit exceeded');
     expect(result).toContain(
-      'Slow response times detected. Switching to the gemini-2.5-flash model',
+      'Possible quota limitations in place or slow response times detected. Switching to the gemini-2.5-flash model',
     );
   });
 
@@ -169,7 +169,7 @@ describe('parseAndFormatApiError', () => {
     );
     expect(result).toContain('[API Error: Rate limit exceeded');
     expect(result).toContain(
-      'Slow response times detected. Switching to the gemini-2.5-flash model',
+      'Possible quota limitations in place or slow response times detected. Switching to the gemini-2.5-flash model',
     );
     expect(result).not.toContain(
       'You have reached your daily gemini-2.5-pro quota limit',
@@ -262,21 +262,17 @@ describe('parseAndFormatApiError', () => {
     );
   });
 
-  it('should handle different Gemini version strings in Pro quota exceeded errors', () => {
-    const errorMessage15 =
-      'got status: 429 Too Many Requests. {"error":{"code":429,"message":"Quota exceeded for quota metric \'Gemini 1.5 Pro Requests\' and limit \'RequestsPerDay\' of service \'generativelanguage.googleapis.com\' for consumer \'project_number:123456789\'.","status":"RESOURCE_EXHAUSTED"}}';
+  it('should handle different Gemini 2.5 version strings in Pro quota exceeded errors', () => {
+    const errorMessage25 =
+      'got status: 429 Too Many Requests. {"error":{"code":429,"message":"Quota exceeded for quota metric \'Gemini 2.5 Pro Requests\' and limit \'RequestsPerDay\' of service \'generativelanguage.googleapis.com\' for consumer \'project_number:123456789\'.","status":"RESOURCE_EXHAUSTED"}}';
     const errorMessagePreview =
       'got status: 429 Too Many Requests. {"error":{"code":429,"message":"Quota exceeded for quota metric \'Gemini 2.5-preview Pro Requests\' and limit \'RequestsPerDay\' of service \'generativelanguage.googleapis.com\' for consumer \'project_number:123456789\'.","status":"RESOURCE_EXHAUSTED"}}';
-    const errorMessageBeta =
-      'got status: 429 Too Many Requests. {"error":{"code":429,"message":"Quota exceeded for quota metric \'Gemini beta-3.0 Pro Requests\' and limit \'RequestsPerDay\' of service \'generativelanguage.googleapis.com\' for consumer \'project_number:123456789\'.","status":"RESOURCE_EXHAUSTED"}}';
-    const errorMessageExperimental =
-      'got status: 429 Too Many Requests. {"error":{"code":429,"message":"Quota exceeded for quota metric \'Gemini experimental-v2 Pro Requests\' and limit \'RequestsPerDay\' of service \'generativelanguage.googleapis.com\' for consumer \'project_number:123456789\'.","status":"RESOURCE_EXHAUSTED"}}';
 
-    const result15 = parseAndFormatApiError(
-      errorMessage15,
+    const result25 = parseAndFormatApiError(
+      errorMessage25,
       AuthType.LOGIN_WITH_GOOGLE,
       undefined,
-      'gemini-1.5-pro',
+      'gemini-2.5-pro',
       DEFAULT_GEMINI_FLASH_MODEL,
     );
     const resultPreview = parseAndFormatApiError(
@@ -286,45 +282,19 @@ describe('parseAndFormatApiError', () => {
       'gemini-2.5-preview-pro',
       DEFAULT_GEMINI_FLASH_MODEL,
     );
-    const resultBeta = parseAndFormatApiError(
-      errorMessageBeta,
-      AuthType.LOGIN_WITH_GOOGLE,
-      undefined,
-      'gemini-beta-3.0-pro',
-      DEFAULT_GEMINI_FLASH_MODEL,
-    );
-    const resultExperimental = parseAndFormatApiError(
-      errorMessageExperimental,
-      AuthType.LOGIN_WITH_GOOGLE,
-      undefined,
-      'gemini-experimental-v2-pro',
-      DEFAULT_GEMINI_FLASH_MODEL,
-    );
 
-    expect(result15).toContain(
-      'You have reached your daily gemini-1.5-pro quota limit',
+    expect(result25).toContain(
+      'You have reached your daily gemini-2.5-pro quota limit',
     );
     expect(resultPreview).toContain(
       'You have reached your daily gemini-2.5-preview-pro quota limit',
     );
-    expect(resultBeta).toContain(
-      'You have reached your daily gemini-beta-3.0-pro quota limit',
-    );
-    expect(resultExperimental).toContain(
-      'You have reached your daily gemini-experimental-v2-pro quota limit',
-    );
-    expect(result15).toContain(
+    expect(result25).toContain(
       'upgrade to a Gemini Code Assist Standard or Enterprise plan',
     );
     expect(resultPreview).toContain(
       'upgrade to a Gemini Code Assist Standard or Enterprise plan',
     );
-    expect(resultBeta).toContain(
-      'upgrade to a Gemini Code Assist Standard or Enterprise plan',
-    );
-    expect(resultExperimental).toContain(
-      'upgrade to a Gemini Code Assist Standard or Enterprise plan',
-    );
   });
 
   it('should not match non-Pro models with similar version strings', () => {
@@ -339,16 +309,6 @@ describe('parseAndFormatApiError', () => {
         "Quota exceeded for quota metric 'Gemini 2.5-preview Flash Requests' and limit",
       ),
     ).toBe(false);
-    expect(
-      isProQuotaExceededError(
-        "Quota exceeded for quota metric 'Gemini beta-3.0 Flash Requests' and limit",
-      ),
-    ).toBe(false);
-    expect(
-      isProQuotaExceededError(
-        "Quota exceeded for quota metric 'Gemini experimental-v2 Flash Requests' and limit",
-      ),
-    ).toBe(false);
 
     // Test other model types
     expect(
diff --git a/packages/cli/src/ui/utils/errorParsing.ts b/packages/cli/src/ui/utils/errorParsing.ts
index 555d5e4e..5031bc0a 100644
--- a/packages/cli/src/ui/utils/errorParsing.ts
+++ b/packages/cli/src/ui/utils/errorParsing.ts
@@ -19,7 +19,7 @@ import {
 const getRateLimitErrorMessageGoogleFree = (
   fallbackModel: string = DEFAULT_GEMINI_FLASH_MODEL,
 ) =>
-  `\nSlow response times detected. Switching to the ${fallbackModel} model for the rest of this session.`;
+  `\nPossible quota limitations in place or slow response times detected. Switching to the ${fallbackModel} model for the rest of this session.`;
 
 const getRateLimitErrorMessageGoogleProQuotaFree = (
   currentModel: string = DEFAULT_GEMINI_MODEL,
@@ -34,7 +34,7 @@ const getRateLimitErrorMessageGoogleGenericQuotaFree = () =>
 const getRateLimitErrorMessageGooglePaid = (
   fallbackModel: string = DEFAULT_GEMINI_FLASH_MODEL,
 ) =>
-  `\nSlow response times detected. Switching to the ${fallbackModel} model for the rest of this session. We appreciate you for choosing Gemini Code Assist and the Gemini CLI.`;
+  `\nPossible quota limitations in place or slow response times detected. Switching to the ${fallbackModel} model for the rest of this session. We appreciate you for choosing Gemini Code Assist and the Gemini CLI.`;
 
 const getRateLimitErrorMessageGoogleProQuotaPaid = (
   currentModel: string = DEFAULT_GEMINI_MODEL,
@@ -53,7 +53,7 @@ const RATE_LIMIT_ERROR_MESSAGE_VERTEX =
 const getRateLimitErrorMessageDefault = (
   fallbackModel: string = DEFAULT_GEMINI_FLASH_MODEL,
 ) =>
-  `\nSlow response times detected. Switching to the ${fallbackModel} model for the rest of this session.`;
+  `\nPossible quota limitations in place or slow response times detected. Switching to the ${fallbackModel} model for the rest of this session.`;
 
 function getRateLimitMessage(
   authType?: AuthType,
diff --git a/packages/core/src/code_assist/server.ts b/packages/core/src/code_assist/server.ts
index 06ce0341..01fd2462 100644
--- a/packages/core/src/code_assist/server.ts
+++ b/packages/core/src/code_assist/server.ts
@@ -31,7 +31,23 @@ import {
   toCountTokenRequest,
   toGenerateContentRequest,
 } from './converter.js';
-import { PassThrough } from 'node:stream';
+import { Readable } from 'node:stream';
+
+interface ErrorData {
+  error?: {
+    message?: string;
+  };
+}
+
+interface GaxiosResponse {
+  status: number;
+  data: unknown;
+}
+
+interface StreamError extends Error {
+  status?: number;
+  response?: GaxiosResponse;
+}
 
 /** HTTP options to be used in each of the requests. */
 export interface HttpOptions {
@@ -177,8 +193,45 @@ export class CodeAssistServer implements ContentGenerator {
     });
 
     return (async function* (): AsyncGenerator<T> {
+      // Convert ReadableStream to Node.js stream if needed
+      let nodeStream: NodeJS.ReadableStream;
+
+      if (res.data instanceof ReadableStream) {
+        // Convert Web ReadableStream to Node.js Readable stream
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        nodeStream = Readable.fromWeb(res.data as any);
+      } else if (
+        res.data &&
+        typeof (res.data as NodeJS.ReadableStream).on === 'function'
+      ) {
+        // Already a Node.js stream
+        nodeStream = res.data as NodeJS.ReadableStream;
+      } else {
+        // If res.data is not a stream, it might be an error response
+        // Try to extract error information from the response
+        let errorMessage =
+          'Response data is not a readable stream. This may indicate a server error or quota issue.';
+
+        if (res.data && typeof res.data === 'object') {
+          // Check if this is an error response with error details
+          const errorData = res.data as ErrorData;
+          if (errorData.error?.message) {
+            errorMessage = errorData.error.message;
+          } else if (typeof errorData === 'string') {
+            errorMessage = errorData;
+          }
+        }
+
+        // Build an Error from the extracted message so any quota details it contains remain visible to callers
+        const error: StreamError = new Error(errorMessage);
+        // Add status and response properties so it can be properly handled by retry logic
+        error.status = res.status;
+        error.response = res;
+        throw error;
+      }
+
       const rl = readline.createInterface({
-        input: res.data as PassThrough,
+        input: nodeStream,
         crlfDelay: Infinity, // Recognizes '\r\n' and '\n' as line breaks
       });
 
diff --git a/packages/core/src/config/config.ts b/packages/core/src/config/config.ts
index b0659a9d..51915fc8 100644
--- a/packages/core/src/config/config.ts
+++ b/packages/core/src/config/config.ts
@@ -104,7 +104,7 @@ export type FlashFallbackHandler = (
   currentModel: string,
   fallbackModel: string,
   error?: unknown,
-) => Promise<boolean>;
+) => Promise<boolean | string | null>;
 
 export interface ConfigParameters {
   sessionId: string;
@@ -183,6 +183,7 @@ export class Config {
   private readonly listExtensions: boolean;
   private readonly _activeExtensions: ActiveExtension[];
   flashFallbackHandler?: FlashFallbackHandler;
+  private quotaErrorOccurred: boolean = false;
 
   constructor(params: ConfigParameters) {
     this.sessionId = params.sessionId;
@@ -304,6 +305,14 @@ export class Config {
     this.flashFallbackHandler = handler;
   }
 
+  setQuotaErrorOccurred(value: boolean): void {
+    this.quotaErrorOccurred = value;
+  }
+
+  getQuotaErrorOccurred(): boolean {
+    return this.quotaErrorOccurred;
+  }
+
   getEmbeddingModel(): string {
     return this.embeddingModel;
   }
diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts
index 80680aca..cd77a3f7 100644
--- a/packages/core/src/core/client.test.ts
+++ b/packages/core/src/core/client.test.ts
@@ -178,6 +178,8 @@ describe('Gemini Client (client.ts)', () => {
         getProxy: vi.fn().mockReturnValue(undefined),
         getWorkingDir: vi.fn().mockReturnValue('/test/dir'),
         getFileService: vi.fn().mockReturnValue(fileService),
+        getQuotaErrorOccurred: vi.fn().mockReturnValue(false),
+        setQuotaErrorOccurred: vi.fn(),
       };
       return mock as unknown as Config;
     });
@@ -351,7 +353,7 @@ describe('Gemini Client (client.ts)', () => {
       await client.generateJson(contents, schema, abortSignal);
 
       expect(mockGenerateContentFn).toHaveBeenCalledWith({
-        model: DEFAULT_GEMINI_FLASH_MODEL,
+        model: 'test-model', // Should use current model from config
         config: {
           abortSignal,
           systemInstruction: getCoreSystemPrompt(''),
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index b8996cbf..51aab961 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -262,6 +262,7 @@ export class GeminiClient {
     request: PartListUnion,
     signal: AbortSignal,
     turns: number = this.MAX_TURNS,
+    originalModel?: string,
   ): AsyncGenerator<ServerGeminiStreamEvent, Turn> {
     // Ensure turns never exceeds MAX_TURNS to prevent infinite loops
     const boundedTurns = Math.min(turns, this.MAX_TURNS);
@@ -269,6 +270,9 @@ export class GeminiClient {
       return new Turn(this.getChat());
     }
 
+    // Track the original model from the first call to detect model switching
+    const initialModel = originalModel || this.config.getModel();
+
     const compressed = await this.tryCompressChat();
     if (compressed) {
       yield { type: GeminiEventType.ChatCompressed, value: compressed };
@@ -279,6 +283,14 @@ export class GeminiClient {
       yield event;
     }
     if (!turn.pendingToolCalls.length && signal && !signal.aborted) {
+      // Check if model was switched during the call (likely due to quota error)
+      const currentModel = this.config.getModel();
+      if (currentModel !== initialModel) {
+        // Model was switched (likely due to quota error fallback)
+        // Don't continue with recursive call to prevent unwanted Flash execution
+        return turn;
+      }
+
       const nextSpeakerCheck = await checkNextSpeaker(
         this.getChat(),
         this,
@@ -288,7 +300,12 @@ export class GeminiClient {
         const nextRequest = [{ text: 'Please continue.' }];
         // This recursive call's events will be yielded out, but the final
         // turn object will be from the top-level call.
-        yield* this.sendMessageStream(nextRequest, signal, boundedTurns - 1);
+        yield* this.sendMessageStream(
+          nextRequest,
+          signal,
+          boundedTurns - 1,
+          initialModel,
+        );
       }
     }
     return turn;
@@ -298,9 +315,12 @@ export class GeminiClient {
     contents: Content[],
     schema: SchemaUnion,
     abortSignal: AbortSignal,
-    model: string = DEFAULT_GEMINI_FLASH_MODEL,
+    model?: string,
     config: GenerateContentConfig = {},
   ): Promise<Record<string, unknown>> {
+    // Use current model from config instead of hardcoded Flash model
+    const modelToUse =
+      model || this.config.getModel() || DEFAULT_GEMINI_FLASH_MODEL;
     try {
       const userMemory = this.config.getUserMemory();
       const systemInstruction = getCoreSystemPrompt(userMemory);
@@ -312,7 +332,7 @@ export class GeminiClient {
 
       const apiCall = () =>
         this.getContentGenerator().generateContent({
-          model,
+          model: modelToUse,
           config: {
             ...requestConfig,
             systemInstruction,
@@ -585,10 +605,14 @@ export class GeminiClient {
           fallbackModel,
           error,
         );
-        if (accepted) {
+        if (accepted !== false && accepted !== null) {
           this.config.setModel(fallbackModel);
           return fallbackModel;
         }
+        // Check if the model was switched manually in the handler
+        if (this.config.getModel() === fallbackModel) {
+          return null; // Model was switched but don't continue with current prompt
+        }
       } catch (error) {
         console.warn('Flash fallback handler failed:', error);
       }
diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts
index bfaeb8f6..35e6bf6c 100644
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@@ -43,6 +43,8 @@ describe('GeminiChat', () => {
       }),
       getModel: vi.fn().mockReturnValue('gemini-pro'),
       setModel: vi.fn(),
+      getQuotaErrorOccurred: vi.fn().mockReturnValue(false),
+      setQuotaErrorOccurred: vi.fn(),
       flashFallbackHandler: undefined,
     } as unknown as Config;
 
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index 1be84f2e..2c149e93 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -217,10 +217,14 @@ export class GeminiChat {
           fallbackModel,
           error,
         );
-        if (accepted) {
+        if (accepted !== false && accepted !== null) {
           this.config.setModel(fallbackModel);
           return fallbackModel;
         }
+        // Check if the model was switched manually in the handler
+        if (this.config.getModel() === fallbackModel) {
+          return null; // Model was switched but don't continue with current prompt
+        }
       } catch (error) {
         console.warn('Flash fallback handler failed:', error);
       }
@@ -262,12 +266,25 @@ export class GeminiChat {
     let response: GenerateContentResponse;
 
     try {
-      const apiCall = () =>
-        this.contentGenerator.generateContent({
-          model: this.config.getModel() || DEFAULT_GEMINI_FLASH_MODEL,
+      const apiCall = () => {
+        const modelToUse = this.config.getModel() || DEFAULT_GEMINI_FLASH_MODEL;
+
+        // Prevent Flash model calls immediately after quota error
+        if (
+          this.config.getQuotaErrorOccurred() &&
+          modelToUse === DEFAULT_GEMINI_FLASH_MODEL
+        ) {
+          throw new Error(
+            'Please submit a new query to continue with the Flash model.',
+          );
+        }
+
+        return this.contentGenerator.generateContent({
+          model: modelToUse,
           contents: requestContents,
           config: { ...this.generationConfig, ...params.config },
         });
+      };
 
       response = await retryWithBackoff(apiCall, {
         shouldRetry: (error: Error) => {
@@ -354,12 +371,25 @@ export class GeminiChat {
     const startTime = Date.now();
 
     try {
-      const apiCall = () =>
-        this.contentGenerator.generateContentStream({
-          model: this.config.getModel(),
+      const apiCall = () => {
+        const modelToUse = this.config.getModel();
+
+        // Prevent Flash model calls immediately after quota error
+        if (
+          this.config.getQuotaErrorOccurred() &&
+          modelToUse === DEFAULT_GEMINI_FLASH_MODEL
+        ) {
+          throw new Error(
+            'Please submit a new query to continue with the Flash model.',
+          );
+        }
+
+        return this.contentGenerator.generateContentStream({
+          model: modelToUse,
           contents: requestContents,
           config: { ...this.generationConfig, ...params.config },
         });
+      };
 
       // Note: Retrying streams can be complex. If generateContentStream itself doesn't handle retries
       // for transient issues internally before yielding the async generator, this retry will re-initiate
diff --git a/packages/core/src/utils/editCorrector.test.ts b/packages/core/src/utils/editCorrector.test.ts
index bcf75dfe..cf9008ef 100644
--- a/packages/core/src/utils/editCorrector.test.ts
+++ b/packages/core/src/utils/editCorrector.test.ts
@@ -214,6 +214,8 @@ describe('editCorrector', () => {
         setAlwaysSkipModificationConfirmation: vi.fn((skip: boolean) => {
           configParams.alwaysSkipModificationConfirmation = skip;
         }),
+        getQuotaErrorOccurred: vi.fn().mockReturnValue(false),
+        setQuotaErrorOccurred: vi.fn(),
       } as unknown as Config;
 
       callCount = 0;
@@ -654,6 +656,8 @@ describe('editCorrector', () => {
         setAlwaysSkipModificationConfirmation: vi.fn((skip: boolean) => {
           configParams.alwaysSkipModificationConfirmation = skip;
         }),
+        getQuotaErrorOccurred: vi.fn().mockReturnValue(false),
+        setQuotaErrorOccurred: vi.fn(),
       } as unknown as Config;
 
       callCount = 0;
diff --git a/packages/core/src/utils/quotaErrorDetection.ts b/packages/core/src/utils/quotaErrorDetection.ts
index ec77f5ee..a8e87a5d 100644
--- a/packages/core/src/utils/quotaErrorDetection.ts
+++ b/packages/core/src/utils/quotaErrorDetection.ts
@@ -41,14 +41,23 @@ export function isProQuotaExceededError(error: unknown): boolean {
   // Check for Pro quota exceeded errors by looking for the specific pattern
   // This will match patterns like:
   // - "Quota exceeded for quota metric 'Gemini 2.5 Pro Requests'"
-  // - "Quota exceeded for quota metric 'Gemini 1.5-preview Pro Requests'"
-  // - "Quota exceeded for quota metric 'Gemini beta-3.0 Pro Requests'"
-  // - "Quota exceeded for quota metric 'Gemini experimental-v2 Pro Requests'"
+  // - "Quota exceeded for quota metric 'Gemini 2.5-preview Pro Requests'"
   // We use string methods instead of regex to avoid ReDoS vulnerabilities
 
-  const checkMessage = (message: string): boolean =>
-    message.includes("Quota exceeded for quota metric 'Gemini") &&
-    message.includes("Pro Requests'");
+  // True only when both Pro-quota fragments occur in `message`. Nothing is
+  // logged here; console output would be written into the user's terminal.
+  const checkMessage = (message: string): boolean => {
+    const hasQuotaPrefix = message.includes(
+      "Quota exceeded for quota metric 'Gemini",
+    );
+    return hasQuotaPrefix && message.includes("Pro Requests'");
+  };
+
+  // The error object is intentionally not dumped to the console here.
+  // JSON.stringify throws a TypeError on circular references, which would
+  // turn this predicate into a crash, and a serialized error may expose
+  // request details. Attach a debugger or a debug-only logger instead when
+  // the shape of an incoming error needs to be inspected.
 
   if (typeof error === 'string') {
     return checkMessage(error);
@@ -62,6 +71,38 @@ export function isProQuotaExceededError(error: unknown): boolean {
     return checkMessage(error.error.message);
   }
 
+  // Gaxios-style errors expose the server payload under `response.data`,
+  // either as the raw body (string) or as a parsed `{ error: { message } }`
+  // object. The payload is inspected but never logged: console output from
+  // this module would be written into the user's terminal.
+  if (error && typeof error === 'object' && 'response' in error) {
+    const gaxiosError = error as {
+      response?: {
+        data?: unknown;
+      };
+    };
+    const data = gaxiosError.response?.data;
+
+    // Raw body: match against the text directly.
+    if (typeof data === 'string') {
+      return checkMessage(data);
+    }
+
+    // Parsed body: match against `error.message` when it is a string. A
+    // missing or non-string message is reported as "not a Pro quota error"
+    // instead of being passed on to string methods it does not support.
+    if (typeof data === 'object' && data !== null && 'error' in data) {
+      const errorData = data as {
+        error?: { message?: unknown };
+      };
+      const message = errorData.error?.message;
+      return typeof message === 'string' && checkMessage(message);
+    }
+  }
+
+  // No recognised error shape matched. Fall through to the final
+  // `return false` without logging the error object, which may be large
+  // and may carry request details.
   return false;
 }
 
diff --git a/packages/core/src/utils/retry.ts b/packages/core/src/utils/retry.ts
index 01651950..e5d65751 100644
--- a/packages/core/src/utils/retry.ts
+++ b/packages/core/src/utils/retry.ts
@@ -18,7 +18,7 @@ export interface RetryOptions {
   onPersistent429?: (
     authType?: string,
     error?: unknown,
-  ) => Promise<string | null>;
+  ) => Promise<string | boolean | null>;
   authType?: string;
 }
 
@@ -102,13 +102,16 @@ export async function retryWithBackoff<T>(
       ) {
         try {
           const fallbackModel = await onPersistent429(authType, error);
-          if (fallbackModel) {
+          if (fallbackModel !== false && fallbackModel !== null) {
             // Reset attempt counter and try with new model
             attempt = 0;
             consecutive429Count = 0;
             currentDelay = initialDelayMs;
             // With the model updated, we continue to the next attempt
             continue;
+          } else {
+            // NOTE(review): null/false should stop retrying, but this throw is caught by the catch below - verify it rethrows.
+            throw error;
           }
         } catch (fallbackError) {
           // If fallback fails, continue with original error
@@ -126,13 +129,16 @@ export async function retryWithBackoff<T>(
       ) {
         try {
           const fallbackModel = await onPersistent429(authType, error);
-          if (fallbackModel) {
+          if (fallbackModel !== false && fallbackModel !== null) {
             // Reset attempt counter and try with new model
             attempt = 0;
             consecutive429Count = 0;
             currentDelay = initialDelayMs;
             // With the model updated, we continue to the next attempt
             continue;
+          } else {
+            // NOTE(review): null/false should stop retrying, but this throw is caught by the catch below - verify it rethrows.
+            throw error;
           }
         } catch (fallbackError) {
           // If fallback fails, continue with original error
@@ -155,13 +161,16 @@ export async function retryWithBackoff<T>(
       ) {
         try {
           const fallbackModel = await onPersistent429(authType, error);
-          if (fallbackModel) {
+          if (fallbackModel !== false && fallbackModel !== null) {
             // Reset attempt counter and try with new model
             attempt = 0;
             consecutive429Count = 0;
             currentDelay = initialDelayMs;
             // With the model updated, we continue to the next attempt
             continue;
+          } else {
+            // NOTE(review): null/false should stop retrying, but this throw is caught by the catch below - verify it rethrows.
+            throw error;
           }
         } catch (fallbackError) {
           // If fallback fails, continue with original error