Refactor request sending to have better exceptions (#4358)

* Correctly retry and back off if we get an HTTP error response

* Refactor request sending to have better exceptions

MatrixFederationHttpClient blindly reraised exceptions to the caller
without differentiating "expected" failures (e.g. connection timeouts)
from more severe problems (e.g. programming errors).

This commit adds a RequestSendFailed exception that is raised when
"expected" failures happen, allowing the TransactionQueue to log them as
warnings while allowing us to log other exceptions as actual exceptions.
This commit is contained in:
Erik Johnston 2019-01-08 11:04:28 +00:00 committed by Richard van der Hoff
parent 1dcb086f33
commit b970cb0e96
6 changed files with 119 additions and 44 deletions

1
changelog.d/4358.misc Normal file
View file

@ -0,0 +1 @@
Add better logging for unexpected errors while sending transactions

View file

@ -348,6 +348,24 @@ class IncompatibleRoomVersionError(SynapseError):
)
class RequestSendFailed(RuntimeError):
    """Sending an HTTP request over federation failed due to not being able to
    talk to the remote server for some reason.

    This exception is used to differentiate "expected" errors that arise due to
    networking (e.g. DNS failures, connection timeouts etc), versus unexpected
    errors (like programming errors).

    Attributes:
        inner_exception (Exception): the underlying exception that this
            error wraps; its type name and message are embedded in this
            exception's own message.
        can_retry (bool): flag supplied by the raiser indicating whether the
            request may be attempted again.
    """

    def __init__(self, inner_exception, can_retry):
        # Embed both the type name and the message of the wrapped exception so
        # that a log line built from str(self) identifies the original failure.
        super(RequestSendFailed, self).__init__(
            "Failed to send request: %s: %s" % (
                type(inner_exception).__name__, inner_exception,
            )
        )
        self.inner_exception = inner_exception
        self.can_retry = can_retry
def cs_error(msg, code=Codes.UNKNOWN, **kwargs):
""" Utility method for constructing an error response for client-server
interactions.

View file

@ -22,7 +22,11 @@ from prometheus_client import Counter
from twisted.internet import defer
import synapse.metrics
from synapse.api.errors import FederationDeniedError, HttpResponseException
from synapse.api.errors import (
FederationDeniedError,
HttpResponseException,
RequestSendFailed,
)
from synapse.handlers.presence import format_user_presence_state, get_interested_remotes
from synapse.metrics import (
LaterGauge,
@ -518,11 +522,16 @@ class TransactionQueue(object):
)
except FederationDeniedError as e:
logger.info(e)
except Exception as e:
logger.warn(
"TX [%s] Failed to send transaction: %s",
except RequestSendFailed as e:
logger.warning("(TX [%s] Failed to send transaction: %s", destination, e)
for p, _ in pending_pdus:
logger.info("Failed to send event %s to %s", p.event_id,
destination)
except Exception:
logger.exception(
"TX [%s] Failed to send transaction",
destination,
e,
)
for p, _ in pending_pdus:
logger.info("Failed to send event %s to %s", p.event_id,

View file

@ -19,7 +19,7 @@ import random
import sys
from io import BytesIO
from six import PY3, string_types
from six import PY3, raise_from, string_types
from six.moves import urllib
import attr
@ -41,6 +41,7 @@ from synapse.api.errors import (
Codes,
FederationDeniedError,
HttpResponseException,
RequestSendFailed,
SynapseError,
)
from synapse.http.endpoint import matrix_federation_endpoint
@ -231,7 +232,7 @@ class MatrixFederationHttpClient(object):
Deferred: resolves with the http response object on success.
Fails with ``HttpResponseException``: if we get an HTTP response
code >= 300.
code >= 300 (except 429).
Fails with ``NotRetryingDestination`` if we are not yet ready
to retry this server.
@ -239,8 +240,8 @@ class MatrixFederationHttpClient(object):
Fails with ``FederationDeniedError`` if this destination
is not on our federation whitelist
(May also fail with plenty of other Exceptions for things like DNS
failures, connection failures, SSL failures.)
Fails with ``RequestSendFailed`` if there were problems connecting to
the remote, due to e.g. DNS failures, connection timeouts etc.
"""
if timeout:
_sec_timeout = timeout / 1000
@ -335,23 +336,74 @@ class MatrixFederationHttpClient(object):
reactor=self.hs.get_reactor(),
)
with Measure(self.clock, "outbound_request"):
response = yield make_deferred_yieldable(
request_deferred,
try:
with Measure(self.clock, "outbound_request"):
response = yield make_deferred_yieldable(
request_deferred,
)
except DNSLookupError as e:
raise_from(RequestSendFailed(e, can_retry=retry_on_dns_fail), e)
except Exception as e:
raise_from(RequestSendFailed(e, can_retry=True), e)
logger.info(
"{%s} [%s] Got response headers: %d %s",
request.txn_id,
request.destination,
response.code,
response.phrase.decode('ascii', errors='replace'),
)
if 200 <= response.code < 300:
pass
else:
# :'(
# Update transactions table?
d = treq.content(response)
d = timeout_deferred(
d,
timeout=_sec_timeout,
reactor=self.hs.get_reactor(),
)
try:
body = yield make_deferred_yieldable(d)
except Exception as e:
# Eh, we're already going to raise an exception so lets
# ignore if this fails.
logger.warn(
"{%s} [%s] Failed to get error response: %s %s: %s",
request.txn_id,
request.destination,
request.method,
url_str,
_flatten_response_never_received(e),
)
body = None
e = HttpResponseException(
response.code, response.phrase, body
)
# Retry if the error is a 429 (Too Many Requests),
# otherwise just raise a standard HttpResponseException
if response.code == 429:
raise_from(RequestSendFailed(e, can_retry=True), e)
else:
raise e
break
except Exception as e:
except RequestSendFailed as e:
logger.warn(
"{%s} [%s] Request failed: %s %s: %s",
request.txn_id,
request.destination,
request.method,
url_str,
_flatten_response_never_received(e),
_flatten_response_never_received(e.inner_exception),
)
if not retry_on_dns_fail and isinstance(e, DNSLookupError):
if not e.can_retry:
raise
if retries_left and not timeout:
@ -376,29 +428,16 @@ class MatrixFederationHttpClient(object):
else:
raise
logger.info(
"{%s} [%s] Got response headers: %d %s",
request.txn_id,
request.destination,
response.code,
response.phrase.decode('ascii', errors='replace'),
)
if 200 <= response.code < 300:
pass
else:
# :'(
# Update transactions table?
d = treq.content(response)
d = timeout_deferred(
d,
timeout=_sec_timeout,
reactor=self.hs.get_reactor(),
)
body = yield make_deferred_yieldable(d)
raise HttpResponseException(
response.code, response.phrase, body
)
except Exception as e:
logger.warn(
"{%s} [%s] Request failed: %s %s: %s",
request.txn_id,
request.destination,
request.method,
url_str,
_flatten_response_never_received(e),
)
raise
defer.returnValue(response)

View file

@ -30,6 +30,7 @@ from synapse.api.errors import (
FederationDeniedError,
HttpResponseException,
NotFoundError,
RequestSendFailed,
SynapseError,
)
from synapse.metrics.background_process_metrics import run_as_background_process
@ -372,10 +373,10 @@ class MediaRepository(object):
"allow_remote": "false",
}
)
except twisted.internet.error.DNSLookupError as e:
logger.warn("HTTP error fetching remote media %s/%s: %r",
except RequestSendFailed as e:
logger.warn("Request failed fetching remote media %s/%s: %r",
server_name, media_id, e)
raise NotFoundError()
raise SynapseError(502, "Failed to fetch remote media")
except HttpResponseException as e:
logger.warn("HTTP error fetching remote media %s/%s: %s",

View file

@ -20,6 +20,7 @@ from twisted.internet.error import ConnectingCancelledError, DNSLookupError
from twisted.web.client import ResponseNeverReceived
from twisted.web.http import HTTPChannel
from synapse.api.errors import RequestSendFailed
from synapse.http.matrixfederationclient import (
MatrixFederationHttpClient,
MatrixFederationRequest,
@ -49,7 +50,8 @@ class FederationClientTests(HomeserverTestCase):
self.pump()
f = self.failureResultOf(d)
self.assertIsInstance(f.value, DNSLookupError)
self.assertIsInstance(f.value, RequestSendFailed)
self.assertIsInstance(f.value.inner_exception, DNSLookupError)
def test_client_never_connect(self):
"""
@ -76,7 +78,11 @@ class FederationClientTests(HomeserverTestCase):
self.reactor.advance(10.5)
f = self.failureResultOf(d)
self.assertIsInstance(f.value, (ConnectingCancelledError, TimeoutError))
self.assertIsInstance(f.value, RequestSendFailed)
self.assertIsInstance(
f.value.inner_exception,
(ConnectingCancelledError, TimeoutError),
)
def test_client_connect_no_response(self):
"""
@ -107,7 +113,8 @@ class FederationClientTests(HomeserverTestCase):
self.reactor.advance(10.5)
f = self.failureResultOf(d)
self.assertIsInstance(f.value, ResponseNeverReceived)
self.assertIsInstance(f.value, RequestSendFailed)
self.assertIsInstance(f.value.inner_exception, ResponseNeverReceived)
def test_client_gets_headers(self):
"""