1
0
Fork 0
mirror of synced 2024-06-02 18:34:37 +12:00
bulk-downloader-for-reddit/bdfr/download_filter.py

45 lines
1.4 KiB
Python
Raw Normal View History

2021-02-07 23:23:08 +13:00
#!/usr/bin/env python3
# coding=utf-8
2021-03-11 16:20:39 +13:00
import logging
2021-02-07 23:23:08 +13:00
import re
2021-03-11 16:20:39 +13:00
logger = logging.getLogger(__name__)


class DownloadFilter:
    """Decide whether a URL may be downloaded, based on exclusion lists.

    A URL is rejected when it ends with one of the excluded extensions, or
    when one of the excluded domains appears anywhere after its
    ``http://`` / ``https://`` prefix. Entries in both lists are matched as
    literal text, not as regular expressions.
    """

    def __init__(self, excluded_extensions: list[str] = None, excluded_domains: list[str] = None):
        """Store the exclusion lists.

        Args:
            excluded_extensions: URL endings to reject (e.g. ``'mp4'``).
                ``None`` or an empty list disables the extension check.
            excluded_domains: Domain fragments to reject (e.g. ``'reddit.com'``).
                ``None`` or an empty list disables the domain check.
        """
        self.excluded_extensions = excluded_extensions
        self.excluded_domains = excluded_domains

    def check_url(self, url: str) -> bool:
        """Return whether a URL is allowed or not"""
        # The extension check runs first; the domain check is skipped when it fails.
        return self._check_extension(url) and self._check_domain(url)

    def _check_extension(self, url: str) -> bool:
        """Return False when ``url`` ends with any excluded extension, else True."""
        if not self.excluded_extensions:
            return True
        # re.escape keeps characters such as '.' or '+' literal, so '.jpg' no
        # longer matches 'ajpg' and an entry like 'c++' cannot raise re.error.
        combined_extensions = '|'.join(re.escape(extension) for extension in self.excluded_extensions)
        pattern = re.compile(r'.*({})$'.format(combined_extensions))
        if re.match(pattern, url):
            logger.log(9, f'Url "{url}" matched with "{str(pattern)}"')
            return False
        return True

    def _check_domain(self, url: str) -> bool:
        """Return False when an excluded domain appears in an http(s) ``url``, else True.

        URLs that do not start with ``http://`` or ``https://`` are never
        rejected by this check.
        """
        if not self.excluded_domains:
            return True
        # Escaped for the same reason as extensions: 'test.com' must not
        # match 'testXcom'.
        combined_domains = '|'.join(re.escape(domain) for domain in self.excluded_domains)
        pattern = re.compile(r'https?://.*({}).*'.format(combined_domains))
        if re.match(pattern, url):
            logger.log(9, f'Url "{url}" matched with "{str(pattern)}"')
            return False
        return True