diff --git a/client/src/components/Chat/Input/TokenUsage/index.tsx b/client/src/components/Chat/Input/TokenUsage/index.tsx
index e0140f6f28..d4ba946d31 100644
--- a/client/src/components/Chat/Input/TokenUsage/index.tsx
+++ b/client/src/components/Chat/Input/TokenUsage/index.tsx
@@ -72,11 +72,14 @@ function TokenUsageIndicator({
/** Config gate kept outside the indicator so disabled deployments mount nothing */
const TokenUsage = memo(function TokenUsage(props: TokenUsageProps) {
const { data: startupConfig } = useGetStartupConfig();
- if (startupConfig?.interface?.contextUsage === false) {
+ /** Wait for config before mounting: until it loads, `contextUsage` reads as
+ * undefined rather than `false`, so a disabled deployment would briefly mount
+ * the indicator and fire the token-config query on first load */
+ if (startupConfig == null || startupConfig.interface?.contextUsage === false) {
return null;
}
return (
-
+
);
});
diff --git a/client/src/hooks/Chat/useTokenUsage.ts b/client/src/hooks/Chat/useTokenUsage.ts
index 190c477893..3d80388cec 100644
--- a/client/src/hooks/Chat/useTokenUsage.ts
+++ b/client/src/hooks/Chat/useTokenUsage.ts
@@ -7,12 +7,13 @@ import type { ContextSnapshot, UsageTotals } from '~/store/usage';
import type { BranchTotals } from '~/utils/tokens';
import {
liveTokensFamily,
+ removeUsageAtoms,
usageTotalsFamily,
branchTotalsFamily,
contextSnapshotFamily,
} from '~/store/usage';
+import { buildIndex, sumBranch, clearIndex, costFromUnits } from '~/utils';
import { useLatestMessageId } from '~/hooks/Messages/useLatestMessage';
-import { buildIndex, sumBranch, costFromUnits } from '~/utils';
import { useTokenConfigQuery } from '~/data-provider';
import useTokenLimits from './useTokenLimits';
@@ -120,7 +121,17 @@ export default function useTokenUsage({
}
rebuild(event.query.state.data as TMessage[] | undefined);
});
- return unsubscribe;
+ return () => {
+ unsubscribe();
+ /** Bound memory to open conversations — drop this one's token index and
+ * usage atoms on switch/unmount; both rebuild from the query cache on
+ * return. NEW_CONVO is migrated to its real id by finalizeUsage, so
+ * leave it alone to avoid racing that handoff. */
+ if (conversationKey !== Constants.NEW_CONVO) {
+ clearIndex(conversationKey);
+ removeUsageAtoms(conversationKey);
+ }
+ };
}, [conversationKey, queryClient, setBranchTotals]);
useEffect(() => {
diff --git a/client/src/hooks/SSE/useUsageHandler.ts b/client/src/hooks/SSE/useUsageHandler.ts
index 9d5eb9a71b..eece21fe1c 100644
--- a/client/src/hooks/SSE/useUsageHandler.ts
+++ b/client/src/hooks/SSE/useUsageHandler.ts
@@ -172,7 +172,9 @@ export default function useUsageHandler(): UsageHandlers {
if (!folded || data.usage_type != null) {
return;
}
- confirmedRef.current += data.output_tokens ?? 0;
+ /** Use the repaired completion count (not raw output_tokens) so the
+ * snapshot gauge keeps the full response for under-reporting providers */
+ confirmedRef.current += normalizeUsageUnits(data).output;
streamCharsRef.current = 0;
setLive(getConvoKey(submission), confirmedRef.current);
};
diff --git a/packages/api/src/endpoints/custom/initialize.spec.ts b/packages/api/src/endpoints/custom/initialize.spec.ts
index 9aa7f0112a..c9d05a3001 100644
--- a/packages/api/src/endpoints/custom/initialize.spec.ts
+++ b/packages/api/src/endpoints/custom/initialize.spec.ts
@@ -376,9 +376,21 @@ describe('initializeCustom – token-config fetch header forwarding', () => {
} as unknown as BaseInitializeParams['db'],
};
- const result = (await initializeCustom(params)) as { endpointTokenConfig?: unknown };
+ const result = (await initializeCustom(params)) as {
+ endpointTokenConfig?: Record>;
+ };
expect(fetchModels).not.toHaveBeenCalled();
- expect(result.endpointTokenConfig).toEqual(tokenConfig);
+ /** Original rates pass through, plus the billing-shape cache keys so
+ * getCacheMultiplier (which reads `write`/`read`) finds them */
+ expect(result.endpointTokenConfig?.['gpt-4']).toEqual({
+ prompt: 1.5,
+ completion: 4.5,
+ context: 32000,
+ cacheRead: 0.3,
+ cacheWrite: 1.8,
+ write: 1.8,
+ read: 0.3,
+ });
});
});
diff --git a/packages/api/src/endpoints/custom/initialize.ts b/packages/api/src/endpoints/custom/initialize.ts
index 9a9e334dfe..0f83794920 100644
--- a/packages/api/src/endpoints/custom/initialize.ts
+++ b/packages/api/src/endpoints/custom/initialize.ts
@@ -36,6 +36,29 @@ export function getTokenConfigKey(
: endpoint;
}
+/**
+ * Maps an admin-facing static `tokenConfig` to the billing shape: the UI uses
+ * `cacheWrite`/`cacheRead`, but `getCacheMultiplier` indexes `write`/`read`.
+ * Returns per-model copies with those keys mirrored from each non-null cache rate
+ * (originals kept, input unmodified) so cache tokens bill at the configured rate.
+ */
+function toBillingTokenConfig(
+  tokenConfig: Record<string, Record<string, number>>,
+): EndpointTokenConfig {
+  const result: EndpointTokenConfig = {};
+  for (const [model, rates] of Object.entries(tokenConfig)) {
+    const mapped: Record<string, number> = { ...rates };
+    if (rates.cacheWrite != null) {
+      mapped.write = rates.cacheWrite;
+    }
+    if (rates.cacheRead != null) {
+      mapped.read = rates.cacheRead;
+    }
+    result[model] = mapped as EndpointTokenConfig[string];
+  }
+  return result;
+}
+
/**
* Builds custom options from endpoint configuration
*/
@@ -161,8 +184,12 @@ export async function initializeCustom({
if (hasTokenConfig) {
/** A static override is authoritative — use it for the agent's billing
- * and balance checks, not just the advertised UI token config */
- endpointTokenConfig = endpointConfig.tokenConfig as EndpointTokenConfig;
+ * and balance checks, not just the advertised UI token config. Mirror
+ * the admin-facing `cacheWrite`/`cacheRead` keys onto the `write`/`read`
+ * keys the billing multiplier reads. */
+ endpointTokenConfig = toBillingTokenConfig(
+ endpointConfig.tokenConfig as Record>,
+ );
} else {
const cachedConfig =
FetchTokenConfig[endpoint.toLowerCase() as keyof typeof FetchTokenConfig] &&