Update bleach clean function (#3503)

* Update bleach clean function

- Invalid tags are stripped out
- & > < characters are accepted

* Throw an error if any field contains HTML tags

* Update unit tests
This commit is contained in:
Oliver 2022-08-15 22:27:25 +10:00 committed by GitHub
parent 956701a584
commit b0e91e7068
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 82 additions and 27 deletions

View File

@ -1,15 +1,18 @@
"""Mixins for (API) views in the whole project."""
from django.utils.translation import gettext_lazy as _
from bleach import clean
from rest_framework import generics, status
from rest_framework.exceptions import ValidationError
from rest_framework.response import Response
class CleanMixin():
"""Model mixin class which cleans inputs."""
"""Model mixin class which cleans inputs using the Mozilla bleach tools."""
# Define a map of fields available for import
SAFE_FIELDS = {}
# Define a list of field names which will *not* be cleaned
SAFE_FIELDS = []
def create(self, request, *args, **kwargs):
"""Override to clean data before processing it."""
@ -34,6 +37,42 @@ class CleanMixin():
return Response(serializer.data)
def clean_string(self, field: str, data: str) -> str:
    """Sanitize one input string, rejecting it if HTML tags were removed.

    The value is passed through bleach with no allowed tags or attributes
    and with stripping enabled. Orphaned ``<``, ``>`` and ``&`` characters,
    which would normally be escaped by bleach, are restored afterwards so
    that they are accepted unchanged.

    Ref: https://github.com/mozilla/bleach/issues/192

    Args:
        field: Name of the field; used as the key of the validation error.
        data: The raw string value to sanitize.

    Returns:
        The sanitized string, with the three entities turned back into
        their raw characters.

    Raises:
        ValidationError: If the sanitized string does not have the same
            length as the input, which is taken to mean tags were removed.
    """
    sanitized = clean(data, strip=True, tags=[], attributes=[])

    # Turn the entities back into raw characters.
    # Order matters: '&amp;' is handled last so that the '&' it produces
    # cannot be combined into another entity and unescaped a second time.
    for entity, raw_char in (('&gt;', '>'), ('&lt;', '<'), ('&amp;', '&')):
        sanitized = sanitized.replace(entity, raw_char)

    # An unchanged length means that nothing was stripped from the input
    if len(sanitized) == len(data):
        return sanitized

    raise ValidationError({
        field: [_("Remove HTML tags from this value")]
    })
def clean_data(self, data: dict) -> dict:
"""Clean / sanitize data.
@ -46,17 +85,24 @@ class CleanMixin():
data (dict): Data that should be sanitized.
Returns:
dict: Profided data sanatized; still in the same order.
dict: Provided data sanitized; still in the same order.
"""
clean_data = {}
for k, v in data.items():
if isinstance(v, str):
ret = clean(v)
if k in self.SAFE_FIELDS:
ret = v
elif isinstance(v, str):
ret = self.clean_string(k, v)
elif isinstance(v, dict):
ret = self.clean_data(v)
else:
ret = v
clean_data[k] = ret
return clean_data

View File

@ -227,31 +227,40 @@ class PartCategoryAPITest(InvenTreeAPITestCase):
url = reverse('api-part-category-detail', kwargs={'pk': 1})
self.patch(
url,
{
'description': '<img src=# onerror=alert("pwned")>',
},
expected_code=200
)
# Invalid values containing tags
invalid_values = [
'<img src="test"/>',
'<a href="#">Link</a>',
"<a href='#'>Link</a>",
'<b>',
]
cat = PartCategory.objects.get(pk=1)
for v in invalid_values:
response = self.patch(
url,
{
'description': v
},
expected_code=400
)
# Image tags have been stripped
self.assertEqual(cat.description, '&lt;img src=# onerror=alert("pwned")&gt;')
# Raw characters should be allowed
allowed = [
'<< hello',
'Alpha & Omega',
'A > B > C',
]
self.patch(
url,
{
'description': '<a href="www.google.com">LINK</a><script>alert("h4x0r")</script>',
},
expected_code=200,
)
for val in allowed:
response = self.patch(
url,
{
'description': val,
},
expected_code=200,
)
# Tags must have been bleached out
cat.refresh_from_db()
self.assertEqual(cat.description, '<a href="www.google.com">LINK</a>&lt;script&gt;alert("h4x0r")&lt;/script&gt;')
self.assertEqual(response.data['description'], val)
class PartOptionsAPITest(InvenTreeAPITestCase):