🩹 fix: Repaired Output in Gauge, Cache-Rate Keys, Config Gate, Usage Cleanup

- live/completed gauge counts the repaired completion (normalized output), so under-reporting providers don't drop the response from used context
- translate static tokenConfig cacheWrite/cacheRead onto the write/read keys getCacheMultiplier reads, so cache tokens bill at the configured rate instead of the prompt-rate fallback
- clear the token index and usage atoms when leaving a conversation, so visited histories don't accumulate in memory for the tab's lifetime
- wait for startupConfig before mounting the gauge, so a deployment with contextUsage disabled never briefly mounts it or fires the token-config query on first load
2026-06-20 20:20:42 +00:00 · 2026-06-13 13:25:45 -04:00 · 2026-06-13 13:25:45 -04:00 · df37d1f0ec
commit df37d1f0ec
parent 5777435aef
5 changed files with 64 additions and 9 deletions
--- a/client/src/components/Chat/Input/TokenUsage/index.tsx
+++ b/client/src/components/Chat/Input/TokenUsage/index.tsx
@ -72,11 +72,14 @@ function TokenUsageIndicator({
 /** Config gate kept outside the indicator so disabled deployments mount nothing */
 const TokenUsage = memo(function TokenUsage(props: TokenUsageProps) {
  const { data: startupConfig } = useGetStartupConfig();
-  if (startupConfig?.interface?.contextUsage === false) {
+  /** Wait for config before mounting: until it loads `contextUsage === false`
+   *  reads as undefined, so a disabled deployment would briefly mount the
+   *  indicator and fire the token-config query on first load */
+  if (startupConfig == null || startupConfig.interface?.contextUsage === false) {
    return null;
  }
  return (
-    <TokenUsageIndicator {...props} showCost={startupConfig?.interface?.contextCost === true} />
+    <TokenUsageIndicator {...props} showCost={startupConfig.interface?.contextCost === true} />
  );
 });

--- a/client/src/hooks/Chat/useTokenUsage.ts
+++ b/client/src/hooks/Chat/useTokenUsage.ts
@ -7,12 +7,13 @@ import type { ContextSnapshot, UsageTotals } from '~/store/usage';
 import type { BranchTotals } from '~/utils/tokens';
 import {
  liveTokensFamily,
+  removeUsageAtoms,
  usageTotalsFamily,
  branchTotalsFamily,
  contextSnapshotFamily,
 } from '~/store/usage';
+import { buildIndex, sumBranch, clearIndex, costFromUnits } from '~/utils';
 import { useLatestMessageId } from '~/hooks/Messages/useLatestMessage';
-import { buildIndex, sumBranch, costFromUnits } from '~/utils';
 import { useTokenConfigQuery } from '~/data-provider';
 import useTokenLimits from './useTokenLimits';

@ -120,7 +121,17 @@ export default function useTokenUsage({
      }
      rebuild(event.query.state.data as TMessage[] | undefined);
    });
-    return unsubscribe;
+    return () => {
+      unsubscribe();
+      /** Bound memory to open conversations — drop this one's token index and
+       *  usage atoms on switch/unmount; both rebuild from the query cache on
+       *  return. NEW_CONVO is migrated to its real id by finalizeUsage, so
+       *  leave it alone to avoid racing that handoff. */
+      if (conversationKey !== Constants.NEW_CONVO) {
+        clearIndex(conversationKey);
+        removeUsageAtoms(conversationKey);
+      }
+    };
  }, [conversationKey, queryClient, setBranchTotals]);

  useEffect(() => {
--- a/client/src/hooks/SSE/useUsageHandler.ts
+++ b/client/src/hooks/SSE/useUsageHandler.ts
@ -172,7 +172,9 @@ export default function useUsageHandler(): UsageHandlers {
      if (!folded || data.usage_type != null) {
        return;
      }
-      confirmedRef.current += data.output_tokens ?? 0;
+      /** Use the repaired completion count (not raw output_tokens) so the
+       *  snapshot gauge keeps the full response for under-reporting providers */
+      confirmedRef.current += normalizeUsageUnits(data).output;
      streamCharsRef.current = 0;
      setLive(getConvoKey(submission), confirmedRef.current);
    };
--- a/packages/api/src/endpoints/custom/initialize.spec.ts
+++ b/packages/api/src/endpoints/custom/initialize.spec.ts
@ -376,9 +376,21 @@ describe('initializeCustom – token-config fetch header forwarding', () => {
      } as unknown as BaseInitializeParams['db'],
    };

-    const result = (await initializeCustom(params)) as { endpointTokenConfig?: unknown };
+    const result = (await initializeCustom(params)) as {
+      endpointTokenConfig?: Record<string, Record<string, number>>;
+    };

    expect(fetchModels).not.toHaveBeenCalled();
-    expect(result.endpointTokenConfig).toEqual(tokenConfig);
+    /** Original rates pass through, plus the billing-shape cache keys so
+     *  getCacheMultiplier (which reads `write`/`read`) finds them */
+    expect(result.endpointTokenConfig?.['gpt-4']).toEqual({
+      prompt: 1.5,
+      completion: 4.5,
+      context: 32000,
+      cacheRead: 0.3,
+      cacheWrite: 1.8,
+      write: 1.8,
+      read: 0.3,
+    });
  });
 });
--- a/packages/api/src/endpoints/custom/initialize.ts
+++ b/packages/api/src/endpoints/custom/initialize.ts
@ -36,6 +36,29 @@ export function getTokenConfigKey(
    : endpoint;
 }

+/**
+ * Maps an admin-facing static `tokenConfig` to the billing shape: the UI uses
+ * `cacheWrite`/`cacheRead`, but `getCacheMultiplier` indexes `write`/`read`.
+ * Adds those keys (preserving the originals) so cache tokens bill at the
+ * configured rate instead of the prompt-rate fallback.
+ *
+ * Each model's rates are shallow-copied into a fresh object, so the input
+ * config is never mutated. When a model defines `cacheWrite` or `cacheRead`,
+ * the mirrored value replaces any `write`/`read` rate already present for
+ * that model. A `null` or `undefined` cache rate is skipped and leaves the
+ * corresponding key untouched; a rate of `0` is still mirrored.
+ *
+ * @param tokenConfig - Per-model rate maps, keyed by model name.
+ * @returns A new config with the same models, each carrying its original
+ *   rates plus `write`/`read` wherever a cache rate was configured.
+ */
+function toBillingTokenConfig(
+  tokenConfig: Record<string, Record<string, number>>,
+): EndpointTokenConfig {
+  const result: EndpointTokenConfig = {};
+  for (const [model, rates] of Object.entries(tokenConfig)) {
+    const mapped = { ...rates } as Record<string, number>;
+    if (rates.cacheWrite != null) {
+      mapped.write = rates.cacheWrite;
+    }
+    if (rates.cacheRead != null) {
+      mapped.read = rates.cacheRead;
+    }
+    result[model] = mapped as EndpointTokenConfig[string];
+  }
+  return result;
+}
+
 /**
 * Builds custom options from endpoint configuration
 */
@ -161,8 +184,12 @@ export async function initializeCustom({

  if (hasTokenConfig) {
    /** A static override is authoritative — use it for the agent's billing
-     *  and balance checks, not just the advertised UI token config */
-    endpointTokenConfig = endpointConfig.tokenConfig as EndpointTokenConfig;
+     *  and balance checks, not just the advertised UI token config. Mirror
+     *  the admin-facing `cacheWrite`/`cacheRead` keys onto the `write`/`read`
+     *  keys the billing multiplier reads. */
+    endpointTokenConfig = toBillingTokenConfig(
+      endpointConfig.tokenConfig as Record<string, Record<string, number>>,
+    );
  } else {
    const cachedConfig =
      FetchTokenConfig[endpoint.toLowerCase() as keyof typeof FetchTokenConfig] &&