Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 68 additions & 6 deletions tencentcloud/common/abstract_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@
from tencentcloud.common.profile.client_profile import ClientProfile, RegionBreakerProfile
from tencentcloud.common.sign import Sign
from tencentcloud.common.circuit_breaker import CircuitBreaker
from tencentcloud.common.domain_failover import (
DomainFailoverManager, _classify_exception, is_failover_triggered,
)
from tencentcloud.common.retry import NoopRetryer

warnings.filterwarnings("ignore", module="tencentcloud", category=UserWarning)
Expand Down Expand Up @@ -89,6 +92,10 @@ def __init__(self, credential, region, profile=None):
if self.profile.region_breaker_profile is None:
self.profile.region_breaker_profile = RegionBreakerProfile()
self.circuit_breaker = CircuitBreaker(self.profile.region_breaker_profile)

# Domain-level failover manager (SDK internal mechanism, completely transparent to users, .com → .com.cn → .cn)
self.domain_failover = DomainFailoverManager()

if self.profile.request_client:
self.request_client = self._sdkVersion + "; " + self.profile.request_client
else:
Expand Down Expand Up @@ -427,16 +434,71 @@ def _call(self, action, params, options=None, headers=None):
headers["X-TC-TraceId"] = str(uuid.uuid4())
if not self.profile.disable_region_breaker:
return self._call_with_region_breaker(action, params, options, headers)
req = RequestInternal(self._get_endpoint(options=options),
self.profile.httpProfile.reqMethod,
self._requestPath,
header=headers)
self._build_req_inter(action, params, req, options)

# apigw_endpoint explicitly specified by user, skip domain switching
if self.profile.httpProfile.apigw_endpoint:
req = RequestInternal(self._get_endpoint(options=options),
self.profile.httpProfile.reqMethod,
self._requestPath,
header=headers)
self._build_req_inter(action, params, req, options)
req.host = self.profile.httpProfile.apigw_endpoint
req.header["Host"] = req.host
return self.request.send_request(req)
return self.request.send_request(req)

origin_endpoint = self._get_endpoint(options=options)
return self._call_with_domain_failover(origin_endpoint, action, params, options, headers)

def _call_with_domain_failover(self, origin_endpoint, action, params, options, headers):
    """Send the request to each available candidate domain in order until one succeeds.

    Candidates come from ``self.domain_failover.iter_available_candidates``;
    each one carries its own circuit breaker plus the generation token that
    has to be handed back to ``breaker.after_requests``.

    Args:
        origin_endpoint: Host the request would normally be sent to.
        action: API action name, forwarded to ``_build_req_inter``.
        params: Request parameters, forwarded to ``_build_req_inter``.
        options: Per-call options, forwarded to ``_build_req_inter``.
        headers: Mapping of request headers; a fresh copy is made per candidate.

    Returns:
        Whatever ``self.request.send_request`` returns for the first candidate
        that succeeds.

    Raises:
        TencentCloudSDKException: Immediately, when the error is not one that
            triggers failover; otherwise the error of the last candidate tried,
            once every candidate has failed.
    """
    candidates = self.domain_failover.iter_available_candidates(origin_endpoint)
    last_err = None

    for cand_host, breaker, generation in candidates:
        # Every candidate needs its own request object and signature because the
        # host changes. A shallow copy of the caller's header mapping is enough
        # to keep headers written for one candidate from leaking into the next.
        cand_headers = dict(headers)
        req = RequestInternal(cand_host,
                              self.profile.httpProfile.reqMethod,
                              self._requestPath,
                              header=cand_headers)
        self._build_req_inter(action, params, req, options)
        # Force the Host header so that signing paths which do not set it
        # themselves still address the candidate host.
        req.header["Host"] = cand_host

        # Temporarily point the connection's request_host at the candidate and
        # restore it in `finally`, whatever the outcome.
        # NOTE(review): this mutates state on the shared self.request.conn object;
        # confirm whether one client instance may issue concurrent calls.
        prev_request_host = self.request.conn.request_host
        self.request.conn.request_host = cand_host
        try:
            resp = self.request.send_request(req)
            breaker.after_requests(generation, True)
            return resp
        except TencentCloudSDKException as e:
            kind = _classify_exception(e)
            if is_failover_triggered(kind):
                # Switchable failure: record it on this candidate's breaker
                # and move on to the next candidate.
                breaker.after_requests(generation, False)
                last_err = e
                logger.debug(
                    "domain_failover: candidate=%s kind=%s err=%s, try next",
                    cand_host, kind, e)
                continue
            # Any other error is re-raised untouched and is deliberately not
            # reported to the breaker, so business errors do not count as
            # domain failures.
            raise
        finally:
            self.request.conn.request_host = prev_request_host

    # Every candidate failed with a switchable error: re-raise the last one.
    # The exception object is raised as-is, so any __cause__ already attached
    # to it stays available to the caller.
    if last_err is not None:
        raise last_err
    # Defensive fallback for an empty candidate list.
    raise TencentCloudSDKException("ClientNetworkError", "all failover candidates failed")

def call(self, action, params, options=None, headers=None):

Expand Down
231 changes: 231 additions & 0 deletions tencentcloud/common/domain_failover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
# -*- coding: utf-8 -*-
#
# Copyright 2017-2026 Tencent Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
"""
Domain-level failover switching module.

When SDK-initiated requests hit DNS/TCP/TLS class failures (see `tests/dns_failure_test/
DNS_FAILURE_SDK_EXCEPTION_ANALYSIS.md` for details), this module retries sequentially
in the order "primary domain → .com.cn → .cn" and maintains an independent CircuitBreaker
for each candidate domain.

Rules:
- *.tencentcloudapi.com -> *.tencentcloudapi.com.cn -> *.tencentcloudapi.cn
- *.{region}.tencentcloudapi.com -> *.{region}.tencentcloudapi.com.cn -> *.{region}.tencentcloudapi.cn
- *.internal.tencentcloudapi.com -> Follow general rules for switching
- *.intl.tencentcloudapi.com -> No switching (international site)
"""
import json
import logging
import socket
import threading

try:
import ssl as _ssl
except ImportError: # pragma: no cover
_ssl = None

from tencentcloud.common.circuit_breaker import CircuitBreaker
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException

logger = logging.getLogger("tencentcloud_sdk_common")

# Failover table: each entry pairs a primary root domain with the backup root
# domains to try, listed in priority order.
_FAILOVER_SUFFIX_RULES = [
    (
        "tencentcloudapi.com",
        ["tencentcloudapi.com.cn", "tencentcloudapi.cn"],
    ),
]

# Suffix of international-site hosts; hosts matching it are never switched.
_INTL_SUFFIX = ".intl.tencentcloudapi.com"


class _InternalBreakerSetting(object):
    """Fixed circuit-breaker thresholds used by domain failover.

    The attribute names mirror those of RegionBreakerProfile so the existing
    CircuitBreaker implementation can consume this object unchanged. The values
    are internal constants and are not exposed to SDK users. A new instance is
    created for every candidate domain's breaker, so breakers never share a
    settings object.
    """

    def __init__(self):
        # Failure-count threshold.
        self.max_fail_num = 5
        # Failure-ratio threshold.
        self.max_fail_percent = 0.75
        # Length of the accumulation window, in seconds (5 minutes).
        self.window_interval = 5 * 60
        # Seconds spent in OPEN before the breaker moves to HALF_OPEN.
        self.timeout = 60
        # Successes required in HALF_OPEN before returning to CLOSED.
        self.max_requests = 5


def _classify_exception(exc):
    """Classify an exception into the failure kind used for domain-failover decisions.

    For a TencentCloudSDKException, inspection starts at its ``__cause__`` (or
    ``__context__`` when no cause is set); any other exception is inspected
    directly. Two exceptions are examined: that starting exception ("raw") and
    the exception at the far end of its ``__cause__`` / ``__context__`` chain
    ("root").

    Args:
        exc: The exception to classify.

    Returns:
        One of the following:

        - "TLS_ERROR": raw is a requests SSLError, or root is an ssl.SSLError.
        - "TCP_READ_TIMEOUT": raw is a requests ReadTimeout or ConnectTimeout,
          or root is a socket.timeout.
        - "TCP_CONN_REFUSED": root is a ConnectionRefusedError.
        - "DNS_TIMEOUT": root is a socket.gaierror with an EAI_AGAIN errno.
        - "DNS_NXDOMAIN": root is any other socket.gaierror, or raw is any
          other requests ConnectionError (catch-all).
        - "JSON_DECODE_ERROR": exc is an SDK exception with code
          "JSONDecodeError".
        - None: nothing above matched, or an SDK exception has no chained
          exception.
    """
    # A JSONDecodeError wrapped at the business-method level has its own kind.
    if isinstance(exc, TencentCloudSDKException) and exc.get_code() == "JSONDecodeError":
        return "JSON_DECODE_ERROR"

    # Pick the exception the classification starts from.
    if isinstance(exc, TencentCloudSDKException):
        raw = exc.__cause__ or exc.__context__
    else:
        raw = exc
    if raw is None:
        return None

    # Follow the chain to its far end; `seen` guards against cyclic chains.
    root = raw
    seen = set()
    while True:
        nxt = getattr(root, "__cause__", None) or getattr(root, "__context__", None)
        if nxt is None or id(nxt) in seen:
            break
        seen.add(id(root))
        root = nxt

    # requests is imported lazily; when it is unavailable every requests class
    # is replaced by an empty tuple, which makes the checks below evaluate false.
    try:
        import requests
        req_conn_err = requests.exceptions.ConnectionError
        req_read_timeout = requests.exceptions.ReadTimeout
        req_connect_timeout = requests.exceptions.ConnectTimeout
        req_ssl_error = requests.exceptions.SSLError
    except ImportError:  # pragma: no cover
        req_conn_err = req_read_timeout = req_connect_timeout = req_ssl_error = ()

    # TLS error
    if req_ssl_error and isinstance(raw, req_ssl_error):
        return "TLS_ERROR"
    if _ssl is not None and isinstance(root, _ssl.SSLError):
        return "TLS_ERROR"

    # Read timeout
    if req_read_timeout and isinstance(raw, req_read_timeout):
        return "TCP_READ_TIMEOUT"
    if isinstance(root, socket.timeout):
        return "TCP_READ_TIMEOUT"

    # Connection timeout: there is no dedicated kind, so it is reported under
    # the read-timeout kind.
    if req_connect_timeout and isinstance(raw, req_connect_timeout):
        return "TCP_READ_TIMEOUT"

    # Connection refused
    if isinstance(root, ConnectionRefusedError):
        return "TCP_CONN_REFUSED"

    # DNS resolution failure
    if isinstance(root, socket.gaierror):
        # -3 is EAI_AGAIN on glibc and 11002 on Windows. The symbolic constant
        # is looked up defensively because the EAI_* names are platform-dependent.
        again_codes = {-3, 11002}
        eai_again = getattr(socket, "EAI_AGAIN", None)
        if eai_again is not None:
            again_codes.add(eai_again)
        if getattr(root, "errno", None) in again_codes:
            return "DNS_TIMEOUT"
        return "DNS_NXDOMAIN"

    # Any other requests ConnectionError still triggers switching, so that
    # unrecognised network failures are not missed.
    if req_conn_err and isinstance(raw, req_conn_err):
        return "DNS_NXDOMAIN"

    return None


def is_failover_triggered(kind):
    """Return True when *kind* is a failure kind that should switch domains.

    "JSON_DECODE_ERROR", None and any unknown value yield False.
    """
    switchable_kinds = (
        "DNS_NXDOMAIN",
        "DNS_TIMEOUT",
        "TCP_CONN_REFUSED",
        "TCP_READ_TIMEOUT",
        "TLS_ERROR",
    )
    return kind in switchable_kinds


def _split_host_suffix(host):
    """Split *host* into ``(prefix, suffix)`` using the known failover root domains.

    The prefix keeps its trailing dot and is the empty string when the host is
    exactly the root domain. ``(None, None)`` is returned for an empty host or
    when no supported root domain matches.
    """
    if not host:
        return None, None
    for root_domain, _alternates in _FAILOVER_SUFFIX_RULES:
        if host != root_domain and not host.endswith("." + root_domain):
            continue
        # Everything in front of the matched root domain, trailing dot included.
        cut = len(host) - len(root_domain)
        return host[:cut], root_domain
    return None, None


def build_candidates(host):
    """Return the ordered candidate hosts for *host*, with *host* itself first.

    ``[host]`` alone is returned when the host is empty, belongs to the
    international site (``*.intl.tencentcloudapi.com``), or does not end in a
    supported root domain (for example a custom endpoint or an IP address).
    Otherwise the host is followed by one entry per backup root domain, each
    built by keeping the original prefix and swapping the root.
    """
    if not host:
        return [host]

    # International-site hosts are never switched.
    bare_intl = _INTL_SUFFIX.lstrip(".")
    if host.endswith(_INTL_SUFFIX) or host == bare_intl:
        return [host]

    prefix, suffix = _split_host_suffix(host)
    if suffix is None:
        return [host]

    backups = dict(_FAILOVER_SUFFIX_RULES)[suffix]
    return [host] + [prefix + backup for backup in backups]


class DomainFailoverManager(object):
    """Keeps one circuit breaker per candidate domain.

    An AbstractClient owns a single manager. A CircuitBreaker is created
    lazily the first time a candidate host is seen, and breakers are not
    shared between client instances (the same scope as the existing
    region_breaker).

    The manager is SDK-internal: it exposes no switch and no thresholds and is
    always active. For hosts outside the ``*.tencentcloudapi.com`` family
    (intl domains, custom endpoints, IPs) the candidate list contains only the
    host itself, so no switching happens.
    """

    def __init__(self):
        # host -> CircuitBreaker; every access goes through _lock.
        self._breakers = {}
        self._lock = threading.Lock()

    def get_breaker(self, host):
        """Return the breaker registered for *host*, creating it on first use."""
        with self._lock:
            if self._breakers.get(host) is None:
                self._breakers[host] = CircuitBreaker(_InternalBreakerSetting())
            return self._breakers[host]

    def iter_available_candidates(self, host):
        """Return ``(candidate_host, breaker, generation)`` tuples in candidate order.

        A candidate whose breaker asks to be skipped is left out. If that
        leaves nothing, the primary host (the first candidate) is returned on
        its own so that traffic is never rejected outright.

        The caller must report each outcome back through
        ``breaker.after_requests(generation, success)``.
        """
        hosts = build_candidates(host)
        available = []
        for name in hosts:
            breaker = self.get_breaker(name)
            generation, skip = breaker.before_requests()
            if not skip:
                available.append((name, breaker, generation))
            else:
                logger.debug("domain_failover: skip %s (breaker open)", name)

        if available:
            return available

        # Every breaker asked to be skipped: give the primary host one more chance.
        primary = hosts[0]
        breaker = self.get_breaker(primary)
        generation, _ = breaker.before_requests()
        return [(primary, breaker, generation)]
11 changes: 10 additions & 1 deletion tencentcloud/common/http/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,17 @@ def send_request(self, req_inter):
http_resp = self._request(req_inter)
self.request_size = self.conn.request_length
return http_resp
except TencentCloudSDKException:
# Already an SDK exception (e.g., ClientParamsError thrown in _request), throw as-is to avoid double wrapping
raise
except Exception as e:
raise TencentCloudSDKException("ClientNetworkError", str(e))
# Keep the original ClientNetworkError shell for compatibility with retry mechanism (StandardRetryer depends on this error code),
# while using PEP 3134 standard `raise ... from e` to establish exception chain, allowing upper layers to
# directly access the original exception via e.__cause__ (such as requests.exceptions.ConnectionError /
# ReadTimeout / SSLError, etc.), and trace along __cause__ to reach the end socket.gaierror,
# ConnectionRefusedError, socket.timeout, CertificateError, etc., thus enabling
# fine-grained failover judgment for DNS/network failures.
raise TencentCloudSDKException("ClientNetworkError", str(e)) from e


class RequestInternal(object):
Expand Down
1 change: 0 additions & 1 deletion tencentcloud/common/profile/client_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ def __init__(self, signMethod=None, httpProfile=None, language="zh-CN",

self.retryer = retryer


class RegionBreakerProfile(object):
"""RegionBreaker profile.

Expand Down
Loading