From 2326e00bc43d61e18a5ba49e22d00da0b04c3693 Mon Sep 17 00:00:00 2001
From: Matthew Hodgson <matthew@arasphere.net>
Date: Mon, 11 Mar 2019 10:53:45 +0100
Subject: [PATCH] fix incorrect encoding of filenames with spaces in (#2090)

fixes https://github.com/vector-im/riot-web/issues/3155
---
 changelog.d/2090.bugfix        |  1 +
 synapse/rest/media/v1/_base.py | 54 ++++++++++++++++++++++++++++++++--
 2 files changed, 52 insertions(+), 3 deletions(-)
 create mode 100644 changelog.d/2090.bugfix
diff --git a/changelog.d/2090.bugfix b/changelog.d/2090.bugfix
new file mode 100644
index 0000000000..de2d22fcb8
--- /dev/null
+++ b/changelog.d/2090.bugfix
@@ -0,0 +1 @@
+Fix a bug where media with spaces in the name would get a corrupted name.
diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py
index fece1ef0b8..953d89bd82 100644
--- a/synapse/rest/media/v1/_base.py
+++ b/synapse/rest/media/v1/_base.py
@@ -100,10 +100,29 @@ def add_file_headers(request, media_type, file_size, upload_name):
 
     request.setHeader(b"Content-Type", media_type.encode("UTF-8"))
     if upload_name:
-        if is_ascii(upload_name):
-            disposition = "inline; filename=%s" % (_quote(upload_name),)
+        # RFC6266 section 4.1 [1] defines both `filename` and `filename*`.
+        #
+        # `filename` is defined to be a `value`, which is defined by RFC2616
+        # section 3.6 [2] to be a `token` or a `quoted-string`, where a `token`
+        # is (essentially) a single US-ASCII word, and a `quoted-string` is a
+        # US-ASCII string surrounded by double-quotes, using backslash as an
+        # escape character. Note that %-encoding is *not* permitted.
+        #
+        # `filename*` is defined to be an `ext-value`, which is defined in
+        # RFC5987 section 3.2.1 [3] to be `charset "'" [ language ] "'" value-chars`,
+        # where `value-chars` is essentially a %-encoded string in the given charset.
+        #
+        # [1]: https://tools.ietf.org/html/rfc6266#section-4.1
+        # [2]: https://tools.ietf.org/html/rfc2616#section-3.6
+        # [3]: https://tools.ietf.org/html/rfc5987#section-3.2.1
+
+        # We avoid the quoted-string version of `filename`, because (a) synapse didn't
+        # correctly interpret those as of 0.99.2 and (b) they are a bit of a pain and we
+        # may as well just do the filename* version.
+        if _can_encode_filename_as_token(upload_name):
+            disposition = 'inline; filename=%s' % (upload_name, )
         else:
-            disposition = "inline; filename*=utf-8''%s" % (_quote(upload_name),)
+            disposition = "inline; filename*=utf-8''%s" % (_quote(upload_name), )
 
         request.setHeader(b"Content-Disposition", disposition.encode('ascii'))
 
@@ -116,6 +135,35 @@ def add_file_headers(request, media_type, file_size, upload_name):
     request.setHeader(b"Content-Length", b"%d" % (file_size,))
 
 
+# Separators as defined in RFC2616 section 2.2. SP and HT are not listed here:
+# _can_encode_filename_as_token rejects them via its `ord(c) <= 32` check.
+_FILENAME_SEPARATOR_CHARS = set((
+    "(", ")", "<", ">", "@", ",", ";", ":", "\\", '"',
+    "/", "[", "]", "?", "=", "{", "}",
+))
+
+
+def _can_encode_filename_as_token(x):
+    for c in x:
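+        # Reject any character that RFC2616 section 2.2 forbids in a `token`:
+        #
+        #        token          = 1*<any CHAR except CTLs or separators>
+        #        separators     = "(" | ")" | "<" | ">" | "@"
+        #                       | "," | ";" | ":" | "\" | <">
+        #                       | "/" | "[" | "]" | "?" | "="
+        #                       | "{" | "}" | SP | HT
+        #        CHAR           = <any US-ASCII character (octets 0 - 127)>
+        #        CTL            = <any US-ASCII control character
+        #                         (octets 0 - 31) and DEL (127)>
+        #
+        # `ord(c) <= 32` covers the CTLs plus SP (32) and HT (9); `ord(c) >= 127`
+        # covers DEL and every non-ASCII character; the remaining separators
+        # are looked up in _FILENAME_SEPARATOR_CHARS.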
+        if ord(c) >= 127 or ord(c) <= 32 or c in _FILENAME_SEPARATOR_CHARS:
+            return False
+    return True
+
+
 @defer.inlineCallbacks
 def respond_with_responder(request, responder, media_type, file_size, upload_name=None):
     """Responds to the request with given responder. If responder is None then