claude-code-proxy/proxy/internal/config/config.go
sid 8e550b9785 Local fork: hardening + ops improvements (timeout knob, demotion, /livez, drain)
This commit captures both the prior accumulated work-in-progress
(framework migration web/→svelte/, postgres storage, conversation
viewer, dashboard auth, OpenAPI spec, integration tests) AND today's
operational improvements layered on top. History wasn't checkpointed
incrementally; happy to split it via interactive rebase if a reviewer
wants smaller commits.

Today's changes (in addition to the older WIP):

1. Configurable upstream response-header timeout
   - ANTHROPIC_RESPONSE_HEADER_TIMEOUT env (default 300s)
   - Replaces hardcoded 300s in provider/anthropic.go that was firing
     on opus + 1M-context + extended thinking non-streaming requests
   - Files: internal/config/config.go, internal/provider/anthropic.go

2. Structured forward-error diagnostic logging
   - When a forward to Anthropic fails, log a single key=value line
     with request_id, model, stream, body_bytes, has_thinking,
     anthropic_beta, query, elapsed, ctx_err — alongside the existing
     human-readable error line for back-compat
   - Files: internal/handler/handlers.go (logForwardFailure)

3. Full SSE protocol passthrough + Flusher fix
   - handler/handlers.go: forward all SSE lines verbatim (event:, id:,
     retry:, : comments, blank-line terminators), not only data:.
     Previous code produced malformed SSE for strict parsers.
   - middleware/logging.go: explicit Flush() method on responseWriter.
     Embedding http.ResponseWriter (interface) does not auto-promote
     Flush(), so every w.(http.Flusher) check in the streaming
     handler was returning ok=false and SSE writes buffered in net/http
     until the body closed.

4. Non-streaming → streaming demotion (feature-flagged)
   - ANTHROPIC_DEMOTE_NONSTREAMING env (default false)
   - When enabled and the routed provider is anthropic, force stream=true
     upstream for clients that asked for stream=false. Receive SSE,
     accumulate via accumulateSSEToMessage (handles text, tool_use with
     partial_json reassembly, thinking, signature, citations_delta,
     usage merge), and synthesize a single non-streaming JSON response.
   - Eliminates the ResponseHeaderTimeout class of failure entirely.
   - Body rewrite uses json.Decoder + UseNumber() to preserve integer
     precision in unknown nested fields (tool inputs from prior turns).
   - Files: internal/config/config.go, internal/handler/handlers.go,
     cmd/proxy/main.go, cmd/proxy/main_test.go

5. Live operational state: /livez gauge + graceful drain
   - New internal/runtime package: atomic in-flight counter + draining flag
   - New middleware/inflight.go: increments runtime gauge, applied to
     /v1/* subrouter so Messages, ChatCompletions, and ProxyPassthrough
     are all counted
   - /v1/* moved to a gorilla/mux subrouter so the InFlight middleware
     applies surgically; /health, /livez, /openapi.* remain on parent
     router (unauthenticated, uncounted)
   - Health handler returns 503 draining when runtime.IsDraining() is
     true, so Traefik stops routing to a slot before drain begins
   - New /livez handler returns {status, in_flight, draining, timestamp}
   - SIGTERM handler in main.go: SetDraining(true), poll for in_flight==0
     with 32-min ceiling and 1s tick (logs every 10s), then srv.Shutdown
   - Auth bypass list extended with /livez
   - Files: internal/runtime/runtime.go (new),
     internal/middleware/inflight.go (new),
     internal/middleware/auth.go,
     internal/handler/handlers.go (Health, Livez, runtime import),
     cmd/proxy/main.go (subrouter, drain loop)

6. OpenAPI spec updates
   - Document Health 503 response and new DrainingResponse schema
   - Add /livez path with LivezResponse schema
   - Files: internal/handler/openapi.go

Verified: go build ./... clean, go test ./... all pass, go vet clean.
Three rounds of codex peer review across changes 1-5; all feedback
addressed (citations_delta, json.Number precision, drain-loop logging
via lastLog timestamp, PathPrefix tightened to "/v1/").
2026-05-02 15:15:58 -06:00

445 lines
13 KiB
Go

package config
import (
"fmt"
"net"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/joho/godotenv"
"gopkg.in/yaml.v3"
)
// Config is the root of the proxy configuration. Load fills it from the
// built-in defaults, an optional config.yaml, and environment variables.
type Config struct {
	// Server holds the listen address and HTTP server timeouts.
	Server ServerConfig `yaml:"server"`
	// Providers holds the per-upstream (Anthropic, OpenAI) settings.
	Providers ProvidersConfig `yaml:"providers"`
	// Storage controls the request database and what gets captured in it.
	Storage StorageConfig `yaml:"storage"`
	// Subagents holds the subagent enable flag and its mapping table.
	Subagents SubagentsConfig `yaml:"subagents"`
	// Auth holds the authentication settings checked by validateSecurity.
	Auth AuthConfig `yaml:"auth"`
	// CORS lists the allowed origins, methods and headers.
	CORS CORSConfig `yaml:"cors"`
}
// CORSConfig lists what cross-origin requests are allowed. Each list can be
// replaced wholesale by a comma-separated environment variable
// (CORS_ALLOWED_ORIGINS, CORS_ALLOWED_METHODS, CORS_ALLOWED_HEADERS).
type CORSConfig struct {
	// AllowedOrigins defaults to ["*"].
	AllowedOrigins []string `yaml:"allowed_origins"`
	// AllowedMethods defaults to GET, POST, DELETE and OPTIONS.
	AllowedMethods []string `yaml:"allowed_methods"`
	// AllowedHeaders defaults to the list built in defaultConfig.
	AllowedHeaders []string `yaml:"allowed_headers"`
}
// ServerConfig describes where the proxy listens and its HTTP timeouts.
type ServerConfig struct {
	// Host is the bind address (default "0.0.0.0", env SERVER_HOST).
	// validateSecurity refuses a non-loopback host without auth unless
	// Auth.TrustProxy is set.
	Host string `yaml:"host"`
	// Port is the listen port as a string (default "3001", env PORT).
	Port string `yaml:"port"`
	// Timeouts carries the timeouts as duration strings from the YAML file;
	// Load parses them into the time.Duration fields below.
	Timeouts TimeoutsConfig `yaml:"timeouts"`
	// Legacy fields: the parsed timeouts. They default to 600s each and can be
	// set through READ_TIMEOUT, WRITE_TIMEOUT and IDLE_TIMEOUT.
	ReadTimeout time.Duration
	WriteTimeout time.Duration
	IdleTimeout time.Duration
}
// TimeoutsConfig holds the server timeouts exactly as written in the YAML
// file. Each value is a string that Load parses with time.ParseDuration
// (for example "30s" or "10m"); empty or unparsable values are ignored.
type TimeoutsConfig struct {
	// Read feeds ServerConfig.ReadTimeout.
	Read string `yaml:"read"`
	// Write feeds ServerConfig.WriteTimeout.
	Write string `yaml:"write"`
	// Idle feeds ServerConfig.IdleTimeout.
	Idle string `yaml:"idle"`
}
// ProvidersConfig groups the settings of the upstream providers.
type ProvidersConfig struct {
	// Anthropic holds the Anthropic upstream settings.
	Anthropic AnthropicProviderConfig `yaml:"anthropic"`
	// OpenAI holds the OpenAI upstream settings.
	OpenAI OpenAIProviderConfig `yaml:"openai"`
}
// AnthropicProviderConfig holds the settings for the Anthropic upstream.
type AnthropicProviderConfig struct {
	// BaseURL is the upstream base URL (default "https://api.anthropic.com",
	// env ANTHROPIC_FORWARD_URL).
	BaseURL string `yaml:"base_url"`
	// Version defaults to "2023-06-01" (env ANTHROPIC_VERSION).
	Version string `yaml:"version"`
	// MaxRetries defaults to 3 (env ANTHROPIC_MAX_RETRIES).
	MaxRetries int `yaml:"max_retries"`
	// ResponseHeaderTimeout defaults to 300s (env
	// ANTHROPIC_RESPONSE_HEADER_TIMEOUT, a Go duration string such as "600s").
	// NOTE(review): presumably bounds the wait for upstream response headers
	// in the provider's HTTP transport; confirm in the provider package.
	ResponseHeaderTimeout time.Duration `yaml:"response_header_timeout"`
	// DemoteNonstreaming defaults to false (env ANTHROPIC_DEMOTE_NONSTREAMING).
	// NOTE(review): presumably makes the handler send non-streaming client
	// requests upstream as streaming ones; the consumer is not in this file.
	DemoteNonstreaming bool `yaml:"demote_nonstreaming"`
}
// OpenAIProviderConfig holds the settings for the OpenAI upstream.
type OpenAIProviderConfig struct {
	// BaseURL defaults to "https://api.openai.com" (env OPENAI_BASE_URL).
	BaseURL string `yaml:"base_url"`
	// APIKey is empty by default (env OPENAI_API_KEY).
	APIKey string `yaml:"api_key"`
	// AllowClientAPIKey defaults to false (env OPENAI_ALLOW_CLIENT_API_KEY).
	AllowClientAPIKey bool `yaml:"allow_client_api_key"` // Allow clients to provide their own API key
	// ClientAPIKeyHeader can be set through OPENAI_CLIENT_API_KEY_HEADER.
	ClientAPIKeyHeader string `yaml:"client_api_key_header"` // Header name for client API key (default: x-openai-api-key)
}
// AuthConfig holds the authentication settings. validateSecurity uses
// Enabled, Token and TrustProxy to decide whether the configured bind
// address is acceptable.
type AuthConfig struct {
	// Enabled defaults to false (env AUTH_ENABLED).
	Enabled bool `yaml:"enabled"`
	// Token must be non-empty when Enabled is true and the server binds to a
	// non-loopback host (env AUTH_TOKEN).
	Token string `yaml:"token"`
	// APIKeyHeader defaults to "x-api-key" (env AUTH_API_KEY_HEADER).
	APIKeyHeader string `yaml:"api_key_header"`
	// AllowLocalhostBypass defaults to true (env AUTH_ALLOW_LOCALHOST_BYPASS).
	// NOTE(review): presumably lets loopback clients skip authentication;
	// the enforcing middleware is not in this file.
	AllowLocalhostBypass bool `yaml:"allow_localhost_bypass"`
	// DashboardPassword is empty by default (env DASHBOARD_PASSWORD).
	DashboardPassword string `yaml:"dashboard_password"`
	// TrustProxy makes validateSecurity return early (env TRUST_PROXY).
	TrustProxy bool `yaml:"trust_proxy"` // Skip bind-address auth check (for Docker / reverse-proxy setups)
}
// StorageConfig controls where requests are stored and how much of each
// request is kept.
type StorageConfig struct {
	// RequestsDir has no default and no environment override in this file.
	RequestsDir string `yaml:"requests_dir"`
	// DBType defaults to "sqlite" (env DB_TYPE).
	DBType string `yaml:"db_type"`
	// DBPath defaults to "requests.db" (env DB_PATH).
	DBPath string `yaml:"db_path"`
	// DatabaseURL is empty by default (env DATABASE_URL).
	DatabaseURL string `yaml:"database_url"`
	// CaptureRequestBody defaults to true (env STORAGE_CAPTURE_REQUEST_BODY).
	CaptureRequestBody bool `yaml:"capture_request_body"`
	// CaptureResponseBody defaults to true (env STORAGE_CAPTURE_RESPONSE_BODY).
	CaptureResponseBody bool `yaml:"capture_response_body"`
	// MetadataOnly, when true, makes Load force both Capture* flags to false
	// (env STORAGE_METADATA_ONLY).
	MetadataOnly bool `yaml:"metadata_only"`
	// RetentionDays defaults to 0 (env STORAGE_RETENTION_DAYS).
	// NOTE(review): presumably 0 disables pruning; confirm in the storage layer.
	RetentionDays int `yaml:"retention_days"`
	// RedactedFields defaults to a list of common secret-bearing field names
	// (env STORAGE_REDACTED_FIELDS, comma-separated, replaces the list).
	RedactedFields []string `yaml:"redacted_fields"`
}
// SubagentsConfig holds the subagent settings. It has no environment
// overrides in this file, so it is set from defaults and config.yaml only.
type SubagentsConfig struct {
	// Enable defaults to false.
	Enable bool `yaml:"enable"`
	// Mappings defaults to an empty, non-nil map.
	// NOTE(review): key and value meaning are not visible here; presumably
	// subagent name to target model. Confirm against the consumer.
	Mappings map[string]string `yaml:"mappings"`
}
// Load builds the effective configuration for the proxy.
//
// Sources are applied in increasing order of precedence:
//
//  1. the built-in defaults from defaultConfig,
//  2. the first config.yaml found among candidateConfigPaths,
//  3. environment variables, optionally seeded from a .env file.
//
// The merged result is checked by validateSecurity before it is returned.
// An error is returned when a config file exists but cannot be stat'ed, read
// or parsed, or when validateSecurity rejects the configuration.
func Load() (*Config, error) {
	loadDotEnv()

	cfg := defaultConfig()
	if err := loadFirstAvailableConfig(cfg, candidateConfigPaths()); err != nil {
		return nil, err
	}

	// Convert the YAML duration strings BEFORE reading the environment.
	// Doing it afterwards would let server.timeouts.* from the file clobber
	// READ_TIMEOUT / WRITE_TIMEOUT / IDLE_TIMEOUT, unlike every other setting
	// where the environment wins.
	applyFileTimeouts(cfg)
	applyEnvOverrides(cfg)

	// Metadata-only mode takes priority over the individual capture flags,
	// whichever source they came from.
	if cfg.Storage.MetadataOnly {
		cfg.Storage.CaptureRequestBody = false
		cfg.Storage.CaptureResponseBody = false
	}

	if err := validateSecurity(cfg); err != nil {
		return nil, err
	}
	return cfg, nil
}

// loadDotEnv seeds the process environment from an optional .env file. The
// project root (one level up from proxy/) is tried first; the working
// directory is tried only when that fails.
func loadDotEnv() {
	if err := godotenv.Load(filepath.Join("..", ".env")); err == nil {
		return
	}
	// The .env file is optional: the proxy must also run on system
	// environment variables alone, so a failure here is deliberately ignored.
	_ = godotenv.Load(".env")
}

// applyFileTimeouts parses the server.timeouts.* duration strings into the
// corresponding time.Duration fields. Empty or unparsable strings leave the
// current value untouched.
func applyFileTimeouts(cfg *Config) {
	parseInto := func(dst *time.Duration, raw string) {
		if raw == "" {
			return
		}
		if d, err := time.ParseDuration(raw); err == nil {
			*dst = d
		}
	}

	parseInto(&cfg.Server.ReadTimeout, cfg.Server.Timeouts.Read)
	parseInto(&cfg.Server.WriteTimeout, cfg.Server.Timeouts.Write)
	parseInto(&cfg.Server.IdleTimeout, cfg.Server.Timeouts.Idle)
}

// applyEnvOverrides overlays environment variables on cfg. A variable that is
// unset or empty never changes the current value; an int or duration variable
// that fails to parse is ignored as well.
func applyEnvOverrides(cfg *Config) {
	setString := func(dst *string, key string) {
		if v := os.Getenv(key); v != "" {
			*dst = v
		}
	}
	setBool := func(dst *bool, key string) {
		if os.Getenv(key) != "" {
			*dst = envBool(key)
		}
	}
	// getInt and getDuration already fall back to the value passed in when
	// the variable is empty or invalid, so no extra guard is needed.
	setInt := func(dst *int, key string) { *dst = getInt(key, *dst) }
	setDuration := func(dst *time.Duration, key string) { *dst = getDuration(key, *dst) }
	setList := func(dst *[]string, key string) {
		if v := os.Getenv(key); v != "" {
			*dst = splitAndTrim(v)
		}
	}

	// Server
	setString(&cfg.Server.Port, "PORT")
	setString(&cfg.Server.Host, "SERVER_HOST")
	setDuration(&cfg.Server.ReadTimeout, "READ_TIMEOUT")
	setDuration(&cfg.Server.WriteTimeout, "WRITE_TIMEOUT")
	setDuration(&cfg.Server.IdleTimeout, "IDLE_TIMEOUT")

	// Anthropic provider
	anthropic := &cfg.Providers.Anthropic
	setString(&anthropic.BaseURL, "ANTHROPIC_FORWARD_URL")
	setString(&anthropic.Version, "ANTHROPIC_VERSION")
	setInt(&anthropic.MaxRetries, "ANTHROPIC_MAX_RETRIES")
	setDuration(&anthropic.ResponseHeaderTimeout, "ANTHROPIC_RESPONSE_HEADER_TIMEOUT")
	setBool(&anthropic.DemoteNonstreaming, "ANTHROPIC_DEMOTE_NONSTREAMING")

	// OpenAI provider
	openai := &cfg.Providers.OpenAI
	setString(&openai.BaseURL, "OPENAI_BASE_URL")
	setString(&openai.APIKey, "OPENAI_API_KEY")
	setBool(&openai.AllowClientAPIKey, "OPENAI_ALLOW_CLIENT_API_KEY")
	setString(&openai.ClientAPIKeyHeader, "OPENAI_CLIENT_API_KEY_HEADER")

	// Auth
	setBool(&cfg.Auth.Enabled, "AUTH_ENABLED")
	setString(&cfg.Auth.Token, "AUTH_TOKEN")
	setString(&cfg.Auth.APIKeyHeader, "AUTH_API_KEY_HEADER")
	setBool(&cfg.Auth.AllowLocalhostBypass, "AUTH_ALLOW_LOCALHOST_BYPASS")
	setString(&cfg.Auth.DashboardPassword, "DASHBOARD_PASSWORD")
	setBool(&cfg.Auth.TrustProxy, "TRUST_PROXY")

	// Storage
	setString(&cfg.Storage.DBType, "DB_TYPE")
	setString(&cfg.Storage.DBPath, "DB_PATH")
	setString(&cfg.Storage.DatabaseURL, "DATABASE_URL")
	setBool(&cfg.Storage.CaptureRequestBody, "STORAGE_CAPTURE_REQUEST_BODY")
	setBool(&cfg.Storage.CaptureResponseBody, "STORAGE_CAPTURE_RESPONSE_BODY")
	setBool(&cfg.Storage.MetadataOnly, "STORAGE_METADATA_ONLY")
	setInt(&cfg.Storage.RetentionDays, "STORAGE_RETENTION_DAYS")
	setList(&cfg.Storage.RedactedFields, "STORAGE_REDACTED_FIELDS")

	// CORS (comma-separated values)
	setList(&cfg.CORS.AllowedOrigins, "CORS_ALLOWED_ORIGINS")
	setList(&cfg.CORS.AllowedMethods, "CORS_ALLOWED_METHODS")
	setList(&cfg.CORS.AllowedHeaders, "CORS_ALLOWED_HEADERS")
}
// loadFromFile reads the YAML document at path and decodes it on top of the
// receiver, so keys absent from the file keep their current values. The read
// or decode error is returned unchanged.
func (c *Config) loadFromFile(path string) error {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return readErr
	}
	if decodeErr := yaml.Unmarshal(raw, c); decodeErr != nil {
		return decodeErr
	}
	return nil
}
// defaultConfig returns a freshly allocated Config holding the built-in
// defaults. Fields not mentioned below keep their zero value (auth disabled,
// empty tokens and keys, metadata-only off, retention 0, subagents off).
func defaultConfig() *Config {
	// All three server timeouts share the same default.
	const serverTimeout = 600 * time.Second

	cfg := new(Config)

	cfg.Server = ServerConfig{
		Host:         "0.0.0.0",
		Port:         "3001",
		ReadTimeout:  serverTimeout,
		WriteTimeout: serverTimeout,
		IdleTimeout:  serverTimeout,
	}

	cfg.Providers.Anthropic = AnthropicProviderConfig{
		BaseURL:               "https://api.anthropic.com",
		Version:               "2023-06-01",
		MaxRetries:            3,
		ResponseHeaderTimeout: 300 * time.Second,
	}
	cfg.Providers.OpenAI = OpenAIProviderConfig{
		BaseURL:            "https://api.openai.com",
		ClientAPIKeyHeader: "x-openai-api-key",
	}

	cfg.Storage = StorageConfig{
		DBType:              "sqlite",
		DBPath:              "requests.db",
		CaptureRequestBody:  true,
		CaptureResponseBody: true,
		RedactedFields: []string{
			"api_key",
			"authorization",
			"token",
			"password",
			"secret",
			"access_token",
			"refresh_token",
			"client_secret",
		},
	}

	// The map is non-nil so callers can insert into it without a nil check.
	cfg.Subagents = SubagentsConfig{Mappings: map[string]string{}}

	cfg.Auth = AuthConfig{
		APIKeyHeader:         "x-api-key",
		AllowLocalhostBypass: true,
	}

	cfg.CORS = CORSConfig{
		AllowedOrigins: []string{"*"},
		AllowedMethods: []string{"GET", "POST", "DELETE", "OPTIONS"},
		AllowedHeaders: []string{
			"Accept",
			"Authorization",
			"Content-Type",
			"Anthropic-Version",
			"Anthropic-Beta",
			"X-API-Key",
			"X-Requested-With",
		},
	}

	return cfg
}
// candidateConfigPaths returns the locations searched for config.yaml, in
// priority order: next to the executable's parent directory, then the working
// directory and its two nearest ancestors. Entries that are textually
// identical are returned only once, keeping the first occurrence.
func candidateConfigPaths() []string {
	besideBinary := filepath.Join(filepath.Dir(os.Args[0]), "..", "config.yaml")
	candidates := [...]string{
		besideBinary,
		"config.yaml",
		"../config.yaml",
		"../../config.yaml",
	}

	ordered := make([]string, 0, len(candidates))
	for _, candidate := range candidates {
		alreadyListed := false
		for _, kept := range ordered {
			if kept == candidate {
				alreadyListed = true
				break
			}
		}
		if !alreadyListed {
			ordered = append(ordered, candidate)
		}
	}
	return ordered
}
// validateSecurity rejects configurations that would expose the proxy on a
// non-loopback address without usable authentication.
//
// An empty host is first normalised to "0.0.0.0" (cfg is modified in place).
// When Auth.TrustProxy is set no further check is made. Otherwise a
// non-loopback host requires Auth.Enabled and a non-empty Auth.Token.
func validateSecurity(cfg *Config) error {
	if cfg.Server.Host == "" {
		cfg.Server.Host = "0.0.0.0"
	}

	// Behind a reverse proxy (Docker/Traefik) the proxy is not directly
	// exposed and access control is handled upstream, so the bind-address
	// requirement does not apply.
	if cfg.Auth.TrustProxy {
		return nil
	}

	exposed := !isLoopbackHost(cfg.Server.Host)
	switch {
	case !exposed:
		return nil
	case !cfg.Auth.Enabled:
		return fmt.Errorf("refusing to bind to %q without auth enabled; set AUTH_ENABLED=true and AUTH_TOKEN, or TRUST_PROXY=true for reverse-proxy setups", cfg.Server.Host)
	case cfg.Auth.Token == "":
		return fmt.Errorf("auth is enabled for public access but AUTH_TOKEN is empty")
	default:
		return nil
	}
}
// isLoopbackHost reports whether host names the local machine only: either
// the exact (lower-case) name "localhost" or an IP literal in a loopback
// range. Surrounding whitespace is ignored and IPv6 literals may be wrapped
// in square brackets. Any other hostname, including the empty string, is
// treated as non-loopback.
func isLoopbackHost(host string) bool {
	name := strings.TrimSpace(host)
	if name == "localhost" {
		return true
	}
	addr := net.ParseIP(strings.Trim(name, "[]"))
	return addr != nil && addr.IsLoopback()
}
// loadFirstAvailableConfig loads the first path in paths that exists into cfg
// and then stops. Paths that do not exist are skipped; any other stat failure,
// or a failure to load an existing file, is returned wrapped with the path.
// When none of the paths exists cfg is left untouched and nil is returned.
func loadFirstAvailableConfig(cfg *Config, paths []string) error {
	for _, candidate := range paths {
		_, statErr := os.Stat(candidate)
		switch {
		case statErr == nil:
			if loadErr := cfg.loadFromFile(candidate); loadErr != nil {
				return fmt.Errorf("failed to load config file %q: %w", candidate, loadErr)
			}
			return nil
		case os.IsNotExist(statErr):
			// Missing candidates are expected; try the next location.
			continue
		default:
			return fmt.Errorf("failed to stat config file %q: %w", candidate, statErr)
		}
	}
	return nil
}
// envBool reports whether the environment variable named key holds a truthy
// value. Matching is case-insensitive and ignores surrounding whitespace, so
// a value such as "true " or "TRUE\r" is not silently read as false. That
// matters for security flags like AUTH_ENABLED.
//
// Truthy values are "1", "t", "true", "y", "yes" and "on". Everything else,
// including an unset or empty variable, yields false.
func envBool(key string) bool {
	switch strings.ToLower(strings.TrimSpace(os.Getenv(key))) {
	case "1", "t", "true", "y", "yes", "on":
		return true
	default:
		return false
	}
}
// getEnv returns the value of the environment variable named key, or
// defaultValue when the variable is unset or set to the empty string.
func getEnv(key, defaultValue string) string {
	value, present := os.LookupEnv(key)
	if !present || value == "" {
		return defaultValue
	}
	return value
}
// getDuration reads the environment variable named key as a Go duration
// string (for example "90s" or "5m", see time.ParseDuration).
//
// Surrounding whitespace is ignored, so a padded value such as " 45s " is
// honoured instead of being silently discarded. defaultValue is returned when
// the variable is unset, blank, or not a valid duration (a bare number
// without a unit is not valid).
func getDuration(key string, defaultValue time.Duration) time.Duration {
	raw := strings.TrimSpace(os.Getenv(key))
	if raw == "" {
		return defaultValue
	}
	parsed, err := time.ParseDuration(raw)
	if err != nil {
		return defaultValue
	}
	return parsed
}
// getInt reads the environment variable named key as a base-10 integer.
//
// Surrounding whitespace is ignored, so a padded value such as " 5 " is
// honoured instead of being silently discarded. defaultValue is returned when
// the variable is unset, blank, or not a valid integer.
func getInt(key string, defaultValue int) int {
	raw := strings.TrimSpace(os.Getenv(key))
	if raw == "" {
		return defaultValue
	}
	parsed, err := strconv.Atoi(raw)
	if err != nil {
		return defaultValue
	}
	return parsed
}
// splitAndTrim splits a comma-separated list into its items, trimming the
// whitespace around each item and dropping items that end up empty. The
// result is always non-nil, even when no item remains.
func splitAndTrim(s string) []string {
	fields := strings.FieldsFunc(s, func(r rune) bool { return r == ',' })

	items := make([]string, 0, len(fields))
	for _, field := range fields {
		// FieldsFunc already skips empty fields; whitespace-only ones are
		// filtered here after trimming.
		if item := strings.TrimSpace(field); item != "" {
			items = append(items, item)
		}
	}
	return items
}