222 lines
6.7 KiB
TypeScript
222 lines
6.7 KiB
TypeScript
|
|
// biome-ignore-all assist/source/organizeImports: ANT-ONLY import markers must not be reordered
|
|||
|
|
import { CONTEXT_1M_BETA_HEADER } from '../constants/betas.js'
|
|||
|
|
import { getGlobalConfig } from './config.js'
|
|||
|
|
import { isEnvTruthy } from './envUtils.js'
|
|||
|
|
import { getCanonicalName } from './model/model.js'
|
|||
|
|
import { getModelCapability } from './model/modelCapabilities.js'
|
|||
|
|
|
|||
|
|
// Model context window size in tokens (200k for all models right now).
export const MODEL_CONTEXT_WINDOW_DEFAULT = 200_000

// Maximum output tokens for compact operations.
export const COMPACT_MAX_OUTPUT_TOKENS = 20_000

// Fallback default / upper-limit max output tokens, used by
// getModelMaxOutputTokens when no model-specific tier matches.
const MAX_OUTPUT_TOKENS_DEFAULT = 32_000
const MAX_OUTPUT_TOKENS_UPPER_LIMIT = 64_000

// Capped default for slot-reservation optimization. BQ p99 output = 4,911
// tokens, so 32k/64k defaults over-reserve 8-16× slot capacity. With the cap
// enabled, <1% of requests hit the limit; those get one clean retry at 64k
// (see query.ts max_output_tokens_escalate). Cap is applied in
// claude.ts:getMaxOutputTokensForModel to avoid the growthbook→betas→context
// import cycle.
export const CAPPED_DEFAULT_MAX_TOKENS = 8_000
export const ESCALATED_MAX_TOKENS = 64_000
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Check if 1M context is disabled via environment variable.
|
|||
|
|
* Used by C4E admins to disable 1M context for HIPAA compliance.
|
|||
|
|
*/
|
|||
|
|
export function is1mContextDisabled(): boolean {
|
|||
|
|
return isEnvTruthy(process.env.CLAUDE_CODE_DISABLE_1M_CONTEXT)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
export function has1mContext(model: string): boolean {
|
|||
|
|
if (is1mContextDisabled()) {
|
|||
|
|
return false
|
|||
|
|
}
|
|||
|
|
return /\[1m\]/i.test(model)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// @[MODEL LAUNCH]: Update this pattern if the new model supports 1M context
|
|||
|
|
export function modelSupports1M(model: string): boolean {
|
|||
|
|
if (is1mContextDisabled()) {
|
|||
|
|
return false
|
|||
|
|
}
|
|||
|
|
const canonical = getCanonicalName(model)
|
|||
|
|
return canonical.includes('claude-sonnet-4') || canonical.includes('opus-4-6')
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
export function getContextWindowForModel(
|
|||
|
|
model: string,
|
|||
|
|
betas?: string[],
|
|||
|
|
): number {
|
|||
|
|
// Allow override via environment variable (ant-only)
|
|||
|
|
// This takes precedence over all other context window resolution, including 1M detection,
|
|||
|
|
// so users can cap the effective context window for local decisions (auto-compact, etc.)
|
|||
|
|
// while still using a 1M-capable endpoint.
|
|||
|
|
if (
|
|||
|
|
process.env.USER_TYPE === 'ant' &&
|
|||
|
|
process.env.CLAUDE_CODE_MAX_CONTEXT_TOKENS
|
|||
|
|
) {
|
|||
|
|
const override = parseInt(process.env.CLAUDE_CODE_MAX_CONTEXT_TOKENS, 10)
|
|||
|
|
if (!isNaN(override) && override > 0) {
|
|||
|
|
return override
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// [1m] suffix — explicit client-side opt-in, respected over all detection
|
|||
|
|
if (has1mContext(model)) {
|
|||
|
|
return 1_000_000
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
const cap = getModelCapability(model)
|
|||
|
|
if (cap?.max_input_tokens && cap.max_input_tokens >= 100_000) {
|
|||
|
|
if (
|
|||
|
|
cap.max_input_tokens > MODEL_CONTEXT_WINDOW_DEFAULT &&
|
|||
|
|
is1mContextDisabled()
|
|||
|
|
) {
|
|||
|
|
return MODEL_CONTEXT_WINDOW_DEFAULT
|
|||
|
|
}
|
|||
|
|
return cap.max_input_tokens
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (betas?.includes(CONTEXT_1M_BETA_HEADER) && modelSupports1M(model)) {
|
|||
|
|
return 1_000_000
|
|||
|
|
}
|
|||
|
|
if (getSonnet1mExpTreatmentEnabled(model)) {
|
|||
|
|
return 1_000_000
|
|||
|
|
}
|
|||
|
|
if (process.env.USER_TYPE === 'ant') {
|
|||
|
|
const antModel = resolveAntModel(model)
|
|||
|
|
if (antModel?.contextWindow) {
|
|||
|
|
return antModel.contextWindow
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
return MODEL_CONTEXT_WINDOW_DEFAULT
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
export function getSonnet1mExpTreatmentEnabled(model: string): boolean {
|
|||
|
|
if (is1mContextDisabled()) {
|
|||
|
|
return false
|
|||
|
|
}
|
|||
|
|
// Only applies to sonnet 4.6 without an explicit [1m] suffix
|
|||
|
|
if (has1mContext(model)) {
|
|||
|
|
return false
|
|||
|
|
}
|
|||
|
|
if (!getCanonicalName(model).includes('sonnet-4-6')) {
|
|||
|
|
return false
|
|||
|
|
}
|
|||
|
|
return getGlobalConfig().clientDataCache?.['coral_reef_sonnet'] === 'true'
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Calculate context window usage percentage from token usage data.
|
|||
|
|
* Returns used and remaining percentages, or null values if no usage data.
|
|||
|
|
*/
|
|||
|
|
export function calculateContextPercentages(
|
|||
|
|
currentUsage: {
|
|||
|
|
input_tokens: number
|
|||
|
|
cache_creation_input_tokens: number
|
|||
|
|
cache_read_input_tokens: number
|
|||
|
|
} | null,
|
|||
|
|
contextWindowSize: number,
|
|||
|
|
): { used: number | null; remaining: number | null } {
|
|||
|
|
if (!currentUsage) {
|
|||
|
|
return { used: null, remaining: null }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
const totalInputTokens =
|
|||
|
|
currentUsage.input_tokens +
|
|||
|
|
currentUsage.cache_creation_input_tokens +
|
|||
|
|
currentUsage.cache_read_input_tokens
|
|||
|
|
|
|||
|
|
const usedPercentage = Math.round(
|
|||
|
|
(totalInputTokens / contextWindowSize) * 100,
|
|||
|
|
)
|
|||
|
|
const clampedUsed = Math.min(100, Math.max(0, usedPercentage))
|
|||
|
|
|
|||
|
|
return {
|
|||
|
|
used: clampedUsed,
|
|||
|
|
remaining: 100 - clampedUsed,
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Returns the model's default and upper limit for max output tokens.
|
|||
|
|
*/
|
|||
|
|
export function getModelMaxOutputTokens(model: string): {
|
|||
|
|
default: number
|
|||
|
|
upperLimit: number
|
|||
|
|
} {
|
|||
|
|
let defaultTokens: number
|
|||
|
|
let upperLimit: number
|
|||
|
|
|
|||
|
|
if (process.env.USER_TYPE === 'ant') {
|
|||
|
|
const antModel = resolveAntModel(model.toLowerCase())
|
|||
|
|
if (antModel) {
|
|||
|
|
defaultTokens = antModel.defaultMaxTokens ?? MAX_OUTPUT_TOKENS_DEFAULT
|
|||
|
|
upperLimit = antModel.upperMaxTokensLimit ?? MAX_OUTPUT_TOKENS_UPPER_LIMIT
|
|||
|
|
return { default: defaultTokens, upperLimit }
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
const m = getCanonicalName(model)
|
|||
|
|
|
|||
|
|
if (m.includes('opus-4-6')) {
|
|||
|
|
defaultTokens = 64_000
|
|||
|
|
upperLimit = 128_000
|
|||
|
|
} else if (m.includes('sonnet-4-6')) {
|
|||
|
|
defaultTokens = 32_000
|
|||
|
|
upperLimit = 128_000
|
|||
|
|
} else if (
|
|||
|
|
m.includes('opus-4-5') ||
|
|||
|
|
m.includes('sonnet-4') ||
|
|||
|
|
m.includes('haiku-4')
|
|||
|
|
) {
|
|||
|
|
defaultTokens = 32_000
|
|||
|
|
upperLimit = 64_000
|
|||
|
|
} else if (m.includes('opus-4-1') || m.includes('opus-4')) {
|
|||
|
|
defaultTokens = 32_000
|
|||
|
|
upperLimit = 32_000
|
|||
|
|
} else if (m.includes('claude-3-opus')) {
|
|||
|
|
defaultTokens = 4_096
|
|||
|
|
upperLimit = 4_096
|
|||
|
|
} else if (m.includes('claude-3-sonnet')) {
|
|||
|
|
defaultTokens = 8_192
|
|||
|
|
upperLimit = 8_192
|
|||
|
|
} else if (m.includes('claude-3-haiku')) {
|
|||
|
|
defaultTokens = 4_096
|
|||
|
|
upperLimit = 4_096
|
|||
|
|
} else if (m.includes('3-5-sonnet') || m.includes('3-5-haiku')) {
|
|||
|
|
defaultTokens = 8_192
|
|||
|
|
upperLimit = 8_192
|
|||
|
|
} else if (m.includes('3-7-sonnet')) {
|
|||
|
|
defaultTokens = 32_000
|
|||
|
|
upperLimit = 64_000
|
|||
|
|
} else {
|
|||
|
|
defaultTokens = MAX_OUTPUT_TOKENS_DEFAULT
|
|||
|
|
upperLimit = MAX_OUTPUT_TOKENS_UPPER_LIMIT
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
const cap = getModelCapability(model)
|
|||
|
|
if (cap?.max_tokens && cap.max_tokens >= 4_096) {
|
|||
|
|
upperLimit = cap.max_tokens
|
|||
|
|
defaultTokens = Math.min(defaultTokens, upperLimit)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return { default: defaultTokens, upperLimit }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Returns the max thinking budget tokens for a given model. The max
|
|||
|
|
* thinking tokens should be strictly less than the max output tokens.
|
|||
|
|
*
|
|||
|
|
* Deprecated since newer models use adaptive thinking rather than a
|
|||
|
|
* strict thinking token budget.
|
|||
|
|
*/
|
|||
|
|
export function getMaxThinkingTokensForModel(model: string): number {
|
|||
|
|
return getModelMaxOutputTokens(model).upperLimit - 1
|
|||
|
|
}
|