Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
* [FEATURE] Distributor: Add experimental `-distributor.enable-start-timestamp` flag for Prometheus Remote Write 2.0. When enabled, `StartTimestamp (ST)` is ingested. #7371
* [FEATURE] Memberlist: Add `-memberlist.cluster-label` and `-memberlist.cluster-label-verification-disabled` to prevent accidental cross-cluster gossip joins and support rolling label rollout. #7385
* [FEATURE] Querier: Add timeout classification to classify query timeouts as 4XX (user error) or 5XX (system error) based on phase timing. When enabled, queries that spend most of their time in PromQL evaluation return `422 Unprocessable Entity` instead of `503 Service Unavailable`. #7374
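* [FEATURE] Querier: Add resource-based throttling. New queries are rejected when the instance's CPU or heap utilization exceeds the thresholds configured via `-querier.query-protection.rejection.threshold.cpu-utilization` and `-querier.query-protection.rejection.threshold.heap-utilization`. #7442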
* [ENHANCEMENT] Parquet Converter: Add a ring status page to expose the ring status. #7455
* [ENHANCEMENT] Ingester: Add WAL record metrics to help evaluate the effectiveness of WAL compression type (e.g. snappy, zstd): `cortex_ingester_tsdb_wal_record_part_writes_total`, `cortex_ingester_tsdb_wal_record_parts_bytes_written_total`, and `cortex_ingester_tsdb_wal_record_bytes_saved_total`. #7420
* [ENHANCEMENT] Distributor: Introduce dynamic `Symbols` slice capacity pooling. #7398 #7401
Expand Down
17 changes: 17 additions & 0 deletions docs/blocks-storage/querier.md
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,23 @@ querier:
# Eval time threshold above which a timeout is classified as user error (4XX).
# CLI flag: -querier.timeout-classification-eval-threshold
[timeout_classification_eval_threshold: <duration> | default = 1m30s]

query_protection:
rejection:
threshold:
# EXPERIMENTAL: Max CPU utilization that this instance can reach before
# rejecting new query request (across all tenants) in percentage,
# between 0 and 1. monitored_resources config must include the resource
# type. 0 to disable.
# CLI flag: -querier.query-protection.rejection.threshold.cpu-utilization
[cpu_utilization: <float> | default = 0]

# EXPERIMENTAL: Max heap utilization that this instance can reach before
# rejecting new query request (across all tenants) in percentage,
# between 0 and 1. monitored_resources config must include the resource
# type. 0 to disable.
# CLI flag: -querier.query-protection.rejection.threshold.heap-utilization
[heap_utilization: <float> | default = 0]
```

### `blocks_storage_config`
Expand Down
4 changes: 2 additions & 2 deletions docs/blocks-storage/store-gateway.md
Original file line number Diff line number Diff line change
Expand Up @@ -358,14 +358,14 @@ store_gateway:
query_protection:
rejection:
threshold:
# EXPERIMENTAL: Max CPU utilization that this ingester can reach before
# EXPERIMENTAL: Max CPU utilization that this instance can reach before
# rejecting new query request (across all tenants) in percentage,
# between 0 and 1. monitored_resources config must include the resource
# type. 0 to disable.
# CLI flag: -store-gateway.query-protection.rejection.threshold.cpu-utilization
[cpu_utilization: <float> | default = 0]

# EXPERIMENTAL: Max heap utilization that this ingester can reach before
# EXPERIMENTAL: Max heap utilization that this instance can reach before
# rejecting new query request (across all tenants) in percentage,
# between 0 and 1. monitored_resources config must include the resource
# type. 0 to disable.
Expand Down
25 changes: 21 additions & 4 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -3859,14 +3859,14 @@ instance_limits:
query_protection:
rejection:
threshold:
# EXPERIMENTAL: Max CPU utilization that this ingester can reach before
# EXPERIMENTAL: Max CPU utilization that this instance can reach before
# rejecting new query request (across all tenants) in percentage, between
# 0 and 1. monitored_resources config must include the resource type. 0 to
# disable.
# CLI flag: -ingester.query-protection.rejection.threshold.cpu-utilization
[cpu_utilization: <float> | default = 0]

# EXPERIMENTAL: Max heap utilization that this ingester can reach before
# EXPERIMENTAL: Max heap utilization that this instance can reach before
# rejecting new query request (across all tenants) in percentage, between
# 0 and 1. monitored_resources config must include the resource type. 0 to
# disable.
Expand Down Expand Up @@ -4999,6 +4999,23 @@ thanos_engine:
# Eval time threshold above which a timeout is classified as user error (4XX).
# CLI flag: -querier.timeout-classification-eval-threshold
[timeout_classification_eval_threshold: <duration> | default = 1m30s]

query_protection:
rejection:
threshold:
# EXPERIMENTAL: Max CPU utilization that this instance can reach before
# rejecting new query request (across all tenants) in percentage, between
# 0 and 1. monitored_resources config must include the resource type. 0 to
# disable.
# CLI flag: -querier.query-protection.rejection.threshold.cpu-utilization
[cpu_utilization: <float> | default = 0]

# EXPERIMENTAL: Max heap utilization that this instance can reach before
# rejecting new query request (across all tenants) in percentage, between
# 0 and 1. monitored_resources config must include the resource type. 0 to
# disable.
# CLI flag: -querier.query-protection.rejection.threshold.heap-utilization
[heap_utilization: <float> | default = 0]
```

### `query_frontend_config`
Expand Down Expand Up @@ -6754,14 +6771,14 @@ sharding_ring:
query_protection:
rejection:
threshold:
# EXPERIMENTAL: Max CPU utilization that this ingester can reach before
# EXPERIMENTAL: Max CPU utilization that this instance can reach before
# rejecting new query request (across all tenants) in percentage, between
# 0 and 1. monitored_resources config must include the resource type. 0 to
# disable.
# CLI flag: -store-gateway.query-protection.rejection.threshold.cpu-utilization
[cpu_utilization: <float> | default = 0]

# EXPERIMENTAL: Max heap utilization that this ingester can reach before
# EXPERIMENTAL: Max heap utilization that this instance can reach before
# rejecting new query request (across all tenants) in percentage, between
# 0 and 1. monitored_resources config must include the resource type. 0 to
# disable.
Expand Down
4 changes: 2 additions & 2 deletions pkg/configs/query_protection.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ type threshold struct {
}

// RegisterFlagsWithPrefix registers the query-protection rejection threshold
// flags on f, prepending prefix to each flag name. Two float64 flags are
// defined, one for CPU utilization and one for heap utilization; both default
// to 0, which the help text documents as "disabled".
//
// NOTE(review): this listing is a diff view, so each Float64Var call appears
// twice: first with the previous "this ingester" help text, then with the
// updated "this instance" help text. Presumably only the second pair exists in
// the real file; confirm against the repository before relying on this.
func (cfg *QueryProtection) RegisterFlagsWithPrefix(f *flag.FlagSet, prefix string) {
f.Float64Var(&cfg.Rejection.Threshold.CPUUtilization, prefix+"query-protection.rejection.threshold.cpu-utilization", 0, "EXPERIMENTAL: Max CPU utilization that this ingester can reach before rejecting new query request (across all tenants) in percentage, between 0 and 1. monitored_resources config must include the resource type. 0 to disable.")
f.Float64Var(&cfg.Rejection.Threshold.HeapUtilization, prefix+"query-protection.rejection.threshold.heap-utilization", 0, "EXPERIMENTAL: Max heap utilization that this ingester can reach before rejecting new query request (across all tenants) in percentage, between 0 and 1. monitored_resources config must include the resource type. 0 to disable.")
f.Float64Var(&cfg.Rejection.Threshold.CPUUtilization, prefix+"query-protection.rejection.threshold.cpu-utilization", 0, "EXPERIMENTAL: Max CPU utilization that this instance can reach before rejecting new query request (across all tenants) in percentage, between 0 and 1. monitored_resources config must include the resource type. 0 to disable.")
f.Float64Var(&cfg.Rejection.Threshold.HeapUtilization, prefix+"query-protection.rejection.threshold.heap-utilization", 0, "EXPERIMENTAL: Max heap utilization that this instance can reach before rejecting new query request (across all tenants) in percentage, between 0 and 1. monitored_resources config must include the resource type. 0 to disable.")
}

func (cfg *QueryProtection) Validate(monitoredResources flagext.StringSliceCSV) error {
Expand Down
2 changes: 1 addition & 1 deletion pkg/cortex/cortex.go
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ func (c *Config) Validate(log log.Logger) error {
if err := c.Distributor.Validate(c.LimitsConfig); err != nil {
return errors.Wrap(err, "invalid distributor config")
}
if err := c.Querier.Validate(); err != nil {
if err := c.Querier.Validate(c.ResourceMonitor.Resources); err != nil {
return errors.Wrap(err, "invalid querier config")
}
if c.Querier.TimeoutClassificationEnabled && !c.Frontend.Handler.QueryStatsEnabled {
Expand Down
6 changes: 3 additions & 3 deletions pkg/cortex/modules.go
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ func (t *Cortex) initQueryable() (serv services.Service, err error) {
querierRegisterer := prometheus.WrapRegistererWith(prometheus.Labels{"engine": "querier"}, prometheus.DefaultRegisterer)

// Create a querier queryable and PromQL engine
t.QuerierQueryable, t.ExemplarQueryable, t.QuerierEngine = querier.New(t.Cfg.Querier, t.OverridesConfig, t.Distributor, t.StoreQueryables, querierRegisterer, util_log.Logger, t.OverridesConfig.QueryPartialData)
t.QuerierQueryable, t.ExemplarQueryable, t.QuerierEngine = querier.New(t.Cfg.Querier, t.OverridesConfig, t.Distributor, t.StoreQueryables, querierRegisterer, util_log.Logger, t.OverridesConfig.QueryPartialData, t.ResourceMonitor)

// Use distributor as default MetadataQuerier
t.MetadataQuerier = t.Distributor
Expand Down Expand Up @@ -701,7 +701,7 @@ func (t *Cortex) initRuler() (serv services.Service, err error) {
queryEngine = engine.New(opts, t.Cfg.Ruler.ThanosEngine, rulerRegisterer)
} else {
// TODO: Consider wrapping logger to differentiate from querier module logger
queryable, _, queryEngine = querier.New(t.Cfg.Querier, t.OverridesConfig, t.Distributor, t.StoreQueryables, rulerRegisterer, util_log.Logger, t.OverridesConfig.RulesPartialData)
queryable, _, queryEngine = querier.New(t.Cfg.Querier, t.OverridesConfig, t.Distributor, t.StoreQueryables, rulerRegisterer, util_log.Logger, t.OverridesConfig.RulesPartialData, nil)
}

managerFactory := ruler.DefaultTenantManagerFactory(t.Cfg.Ruler, pusher, queryable, queryEngine, t.OverridesConfig, metrics, prometheus.DefaultRegisterer)
Expand Down Expand Up @@ -956,7 +956,7 @@ func (t *Cortex) setupModuleManager() error {
Ingester: {IngesterService, OverridesConfig, API},
IngesterService: {OverridesConfig, RuntimeConfig, MemberlistKV, ResourceMonitor},
Flusher: {OverridesConfig, API},
Queryable: {OverridesConfig, DistributorService, OverridesConfig, Ring, API, StoreQueryable, MemberlistKV},
Queryable: {OverridesConfig, DistributorService, OverridesConfig, Ring, API, StoreQueryable, MemberlistKV, ResourceMonitor},
Querier: {TenantFederation},
StoreQueryable: {OverridesConfig, OverridesConfig, MemberlistKV, GrpcClientService},
QueryFrontendTripperware: {API, OverridesConfig},
Expand Down
Loading
Loading