1
0
Fork 0
mirror of synced 2024-06-28 19:10:41 +12:00

Move to inheritance system for downloaders

This commit is contained in:
Serene-Arc 2021-02-07 14:46:20 +10:00 committed by Ali Parlakci
parent 69e21e46a2
commit f573038a21
11 changed files with 253 additions and 280 deletions

View file

@ -1,15 +1,18 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# coding=utf-8 # coding=utf-8
import hashlib import hashlib
import os import logging
import sys import re
import urllib.request from abc import ABC, abstractmethod
from abc import ABC
from pathlib import Path from pathlib import Path
import requests
from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip
from bulkredditdownloader.utils import GLOBAL from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print
logger = logging.getLogger(__name__)
class BaseDownloader(ABC): class BaseDownloader(ABC):
@ -17,22 +20,17 @@ class BaseDownloader(ABC):
self.directory = directory self.directory = directory
self.post = post self.post = post
@abstractmethod
def download(self):
raise NotImplementedError
@staticmethod @staticmethod
def createHash(filename: str) -> str: def _create_hash(content: bytes) -> str:
hash_md5 = hashlib.md5() hash_md5 = hashlib.md5(content)
with open(filename, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest() return hash_md5.hexdigest()
@staticmethod @staticmethod
def getFile( def _download_resource(filename: Path, folder_dir: Path, image_url: str, indent: int = 0, silent: bool = False):
filename: str,
short_filename: str,
folder_dir: Path,
image_url: str,
indent: int = 0,
silent: bool = False):
formats = { formats = {
"videos": [".mp4", ".webm"], "videos": [".mp4", ".webm"],
"images": [".jpg", ".jpeg", ".png", ".bmp"], "images": [".jpg", ".jpeg", ".png", ".bmp"],
@ -52,69 +50,55 @@ class BaseDownloader(ABC):
("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 " "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
"Safari/537.36 OPR/54.0.2952.64"), "Safari/537.36 OPR/54.0.2952.64"),
("Accept", "text/html,application/xhtml+xml,application/xml;" ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"),
"q=0.9,image/webp,image/apng,*/*;q=0.8"),
("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"), ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"),
("Accept-Encoding", "none"), ("Accept-Encoding", "none"),
("Accept-Language", "en-US,en;q=0.8"), ("Accept-Language", "en-US,en;q=0.8"),
("Connection", "keep-alive") ("Connection", "keep-alive")
] ]
if not os.path.exists(folder_dir): folder_dir.mkdir(exist_ok=True)
os.makedirs(folder_dir)
opener = urllib.request.build_opener()
if "imgur" not in image_url: if "imgur" not in image_url:
opener.addheaders = headers addheaders = headers
urllib.request.install_opener(opener) else:
addheaders = None
if not silent: if not silent:
print(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n") logger.info(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n")
def dlProgress(count: int, block_size: int, total_size: int):
"""Function for writing download progress to console """
download_mbs = int(count * block_size * (10 ** (-6)))
file_size = int(total_size * (10 ** (-6)))
sys.stdout.write("{}Mb/{}Mb\r".format(download_mbs, file_size))
sys.stdout.flush()
# Loop to attempt download 3 times
for i in range(3): for i in range(3):
file_dir = Path(folder_dir) / filename file_path = Path(folder_dir) / filename
temp_dir = Path(folder_dir) / (filename + ".tmp")
if not (os.path.isfile(file_dir)): if file_path.is_file():
raise FileAlreadyExistsError
else:
try: try:
urllib.request.urlretrieve(image_url, temp_dir, reporthook=dlProgress) download_content = requests.get(image_url, headers=addheaders).content
file_hash = BaseDownloader.createHash(temp_dir)
if GLOBAL.arguments.no_dupes:
if file_hash in GLOBAL.downloadedPosts():
os.remove(temp_dir)
raise FileAlreadyExistsError
GLOBAL.downloadedPosts.add(file_hash)
os.rename(temp_dir, file_dir)
if not silent:
print(" " * indent + "Downloaded" + " " * 10)
return None
except ConnectionResetError: except ConnectionResetError:
raise FailedToDownload raise FailedToDownload
except FileNotFoundError:
filename = short_filename file_hash = BaseDownloader._create_hash(download_content)
else: if GLOBAL.arguments.no_dupes:
raise FileAlreadyExistsError if file_hash in GLOBAL.downloadedPosts():
raise FileAlreadyExistsError
GLOBAL.downloadedPosts.add(file_hash)
with open(file_path, 'wb') as file:
file.write(download_content)
if not silent:
logger.info(" " * indent + "Downloaded" + " " * 10)
return
raise FailedToDownload raise FailedToDownload
@staticmethod @staticmethod
def getExtension(link: str): def _get_extension(url: str) -> str:
"""Extract file extension from image link. If didn't find any, return '.jpg' """ pattern = re.compile(r'(\.(jpg|jpeg|png|mp4|webm|gif))')
image_types = ['jpg', 'png', 'mp4', 'webm', 'gif'] if len(results := re.search(pattern, url).groups()) > 1:
parsed = link.split('.') return results[1]
for fileType in image_types: if "v.redd.it" not in url:
if fileType in parsed: return '.jpg'
return "." + parsed[-1]
else: else:
if "v.redd.it" not in link: return '.mp4'
return '.jpg'
else:
return '.mp4'

View file

@ -1,4 +1,5 @@
import os #!/usr/bin/env python3
import pathlib import pathlib
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
@ -8,11 +9,11 @@ from bulkredditdownloader.utils import GLOBAL
class Direct(BaseDownloader): class Direct(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict): def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post) super().__init__(directory, post)
post['EXTENSION'] = self.getExtension(post['CONTENTURL']) self.download()
if not os.path.exists(directory):
os.makedirs(directory)
filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] def download(self):
short_filename = post['POSTID'] + post['EXTENSION'] self.post['EXTENSION'] = self._get_extension(self.post['CONTENTURL'])
self.directory.mkdir(exist_ok=True)
self.getFile(filename, short_filename, directory, post['CONTENTURL']) filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"]
self._download_resource(pathlib.Path(filename), self.directory, self.post['CONTENTURL'])

View file

@ -1,5 +1,8 @@
import os #!/usr/bin/env python3
import logging
import pathlib import pathlib
import re
import urllib.error import urllib.error
import urllib.request import urllib.request
from html.parser import HTMLParser from html.parser import HTMLParser
@ -7,70 +10,64 @@ from html.parser import HTMLParser
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError from bulkredditdownloader.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError
from bulkredditdownloader.utils import GLOBAL from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print
logger = logging.getLogger(__name__)
class Erome(BaseDownloader): class Erome(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict): def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post) super().__init__(directory, post)
self.download()
def download(self):
try: try:
images = self.getLinks(post['CONTENTURL']) images = self._get_links(self.post['CONTENTURL'])
except urllib.error.HTTPError: except urllib.error.HTTPError:
raise NotADownloadableLinkError("Not a downloadable link") raise NotADownloadableLinkError("Not a downloadable link")
images_length = len(images) images_length = len(images)
how_many_downloaded = images_length how_many_downloaded = len(images)
duplicates = 0 duplicates = 0
if images_length == 1: if images_length == 1:
extension = self.getExtension(images[0])
"""Filenames are declared here""" """Filenames are declared here"""
filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"]
short_filename = post['POSTID'] + extension
image_url = images[0] image = images[0]
if 'https://' not in image_url or 'http://' not in image_url: if not re.match(r'https?://.*', image):
image_url = "https://" + image_url image = "https://" + image
self.getFile(filename, short_filename, directory, image_url) self._download_resource(filename, self.directory, image)
else: else:
filename = GLOBAL.config['filename'].format(**post) filename = GLOBAL.config['filename'].format(**self.post)
print(filename) logger.info(filename)
folder_dir = directory / filename folder_dir = self.directory / filename
try: folder_dir.mkdir(exist_ok=True)
if not os.path.exists(folder_dir):
os.makedirs(folder_dir)
except FileNotFoundError:
folder_dir = directory / post['POSTID']
os.makedirs(folder_dir)
for i in range(images_length):
extension = self.getExtension(images[i])
for i, image in enumerate(images):
extension = self._get_extension(image)
filename = str(i + 1) + extension filename = str(i + 1) + extension
image_url = images[i]
if 'https://' not in image_url and 'http://' not in image_url:
image_url = "https://" + image_url
print(" ({}/{})".format(i + 1, images_length)) if not re.match(r'https?://.*', image):
print(" {}".format(filename)) image = "https://" + image
logger.info(" ({}/{})".format(i + 1, images_length))
logger.info(" {}".format(filename))
try: try:
self.getFile(filename, filename, folder_dir, image_url, indent=2) self._download_resource(pathlib.Path(filename), folder_dir, image, indent=2)
print()
except FileAlreadyExistsError: except FileAlreadyExistsError:
print(" The file already exists" + " " * 10, end="\n\n") logger.info(" The file already exists" + " " * 10, end="\n\n")
duplicates += 1 duplicates += 1
how_many_downloaded -= 1 how_many_downloaded -= 1
except Exception as exception: except Exception as exception:
# raise exception # raise exception
print("\n Could not get the file") logger.error("\n Could not get the file")
print( logger.error(
" " " "
+ "{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception)) + "{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception))
+ "\n" + "\n"
@ -82,10 +79,12 @@ class Erome(BaseDownloader):
elif how_many_downloaded + duplicates < images_length: elif how_many_downloaded + duplicates < images_length:
raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely") raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely")
def getLinks(self, url: str) -> list[str]: @staticmethod
def _get_links(url: str) -> list[str]:
content = [] content = []
line_number = None line_number = None
# TODO: move to bs4 and requests
class EromeParser(HTMLParser): class EromeParser(HTMLParser):
tag = None tag = None

View file

@ -1,7 +1,9 @@
#!/usr/bin/env python3
import json import json
import os
import pathlib import pathlib
import urllib import logging
import urllib.parse
import requests import requests
@ -9,15 +11,18 @@ from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound, from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound,
NotADownloadableLinkError, TypeInSkip) NotADownloadableLinkError, TypeInSkip)
from bulkredditdownloader.utils import GLOBAL from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print
logger = logging.getLogger(__name__)
class Gallery(BaseDownloader): class Gallery(BaseDownloader):
def __init__(self, directory: pathlib.Path, post): def __init__(self, directory: pathlib.Path, post):
super().__init__(directory, post) super().__init__(directory, post)
link = post['CONTENTURL'] link = self.post['CONTENTURL']
self.raw_data = self.getData(link) self.raw_data = self._get_data(link)
self.download()
def download(self):
images = {} images = {}
count = 0 count = 0
for model in self.raw_data['posts']['models']: for model in self.raw_data['posts']['models']:
@ -27,15 +32,15 @@ class Gallery(BaseDownloader):
images[count] = {'id': item['mediaId'], 'url': self.raw_data['posts'] images[count] = {'id': item['mediaId'], 'url': self.raw_data['posts']
['models'][model]['media']['mediaMetadata'][item['mediaId']]['s']['u']} ['models'][model]['media']['mediaMetadata'][item['mediaId']]['s']['u']}
count += 1 count += 1
except Exception: except KeyError:
continue continue
except Exception: except KeyError:
continue continue
self.downloadAlbum(images, count) self._download_album(images, count)
@staticmethod @staticmethod
def getData(link: str) -> dict: def _get_data(link: str) -> dict:
headers = { headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
@ -58,50 +63,42 @@ class Gallery(BaseDownloader):
data = json.loads(page_source[start_index - 1:end_index + 1].strip()[:-1]) data = json.loads(page_source[start_index - 1:end_index + 1].strip()[:-1])
return data return data
def downloadAlbum(self, images: dict, count: int): def _download_album(self, images: dict, count: int):
folder_name = GLOBAL.config['filename'].format(**self.post) folder_name = GLOBAL.config['filename'].format(**self.post)
folder_dir = self.directory / folder_name folder_dir = self.directory / folder_name
how_many_downloaded = 0 how_many_downloaded = 0
duplicates = 0 duplicates = 0
try: folder_dir.mkdir(exist_ok=True)
if not os.path.exists(folder_dir): logger.info(folder_name)
os.makedirs(folder_dir)
except FileNotFoundError:
folder_dir = self.directory / self.post['POSTID']
os.makedirs(folder_dir)
print(folder_name) for i, image in enumerate(images):
path = urllib.parse.urlparse(image['url']).path
extension = pathlib.Path(path).suffix
for i in range(count): filename = pathlib.Path("_".join([str(i + 1), image['id']]) + extension)
path = urllib.parse.urlparse(images[i]['url']).path
extension = os.path.splitext(path)[1]
filename = "_".join([str(i + 1), images[i]['id']]) + extension logger.info("\n ({}/{})".format(i + 1, count))
short_filename = str(i + 1) + "_" + images[i]['id']
print("\n ({}/{})".format(i + 1, count))
try: try:
self.getFile(filename, short_filename, folder_dir, images[i]['url'], indent=2) self._download_resource(filename, folder_dir, image['url'], indent=2)
how_many_downloaded += 1 how_many_downloaded += 1
print()
except FileAlreadyExistsError: except FileAlreadyExistsError:
print(" The file already exists" + " " * 10, end="\n\n") logger.info(" The file already exists" + " " * 10, end="\n\n")
duplicates += 1 duplicates += 1
except TypeInSkip: except TypeInSkip:
print(" Skipping...") logger.info(" Skipping...")
how_many_downloaded += 1 how_many_downloaded += 1
except Exception as exception: except Exception as exception:
print("\n Could not get the file") logger.info("\n Could not get the file")
print(" " + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format( logger.info(" " + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format(
class_name=exception.__class__.__name__, info=str(exception)) + "\n" class_name=exception.__class__.__name__, info=str(exception)) + "\n"
) )
print(GLOBAL.log_stream.getvalue(), no_print=True) logger.info(GLOBAL.log_stream.getvalue(), no_print=True)
if duplicates == count: if duplicates == count:
raise FileAlreadyExistsError raise FileAlreadyExistsError

View file

@ -1,43 +1,32 @@
#!/usr/bin/env python3
import json import json
import os import pathlib
import re
import urllib.request import urllib.request
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.downloaders.gif_delivery_network import GifDeliveryNetwork from bulkredditdownloader.downloaders.gif_delivery_network import GifDeliveryNetwork
from bulkredditdownloader.errors import NotADownloadableLinkError
from bulkredditdownloader.utils import GLOBAL
import pathlib
class Gfycat(BaseDownloader): class Gfycat(GifDeliveryNetwork):
def __init__(self, directory: pathlib.Path, post: dict): def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post) super().__init__(directory, post)
try: self.download()
post['MEDIAURL'] = self.getLink(post['CONTENTURL'])
except IndexError:
raise NotADownloadableLinkError("Could not read the page source")
post['EXTENSION'] = self.getExtension(post['MEDIAURL']) def download(self):
super().download()
if not os.path.exists(directory):
os.makedirs(directory)
filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"]
short_filename = post['POSTID'] + post['EXTENSION']
self.getFile(filename, short_filename, directory, post['MEDIAURL'])
@staticmethod @staticmethod
def getLink(url: str) -> str: def _get_link(url: str) -> str:
"""Extract direct link to the video from page's source """Extract direct link to the video from page's source
and return it and return it
""" """
if '.webm' in url or '.mp4' in url or '.gif' in url: if re.match(r'\.(webm|mp4|gif)$', url):
return url return url
if url[-1:] == '/': if url.endswith('/'):
url = url[:-1] url = url[:-1]
url = "https://gfycat.com/" + url.split('/')[-1] url = "https://gfycat.com/" + url.split('/')[-1]
@ -49,6 +38,6 @@ class Gfycat(BaseDownloader):
content = soup.find("script", attrs=attributes) content = soup.find("script", attrs=attributes)
if content is None: if content is None:
return GifDeliveryNetwork.getLink(url) return super()._get_link(url)
return json.loads(content.contents[0])["video"]["contentUrl"] return json.loads(content.contents[0])["video"]["contentUrl"]

View file

@ -1,4 +1,5 @@
import os #!/usr/bin/env python3
import pathlib import pathlib
import urllib.request import urllib.request
@ -12,23 +13,23 @@ from bulkredditdownloader.utils import GLOBAL
class GifDeliveryNetwork(BaseDownloader): class GifDeliveryNetwork(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict): def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post) super().__init__(directory, post)
self.download()
def download(self):
try: try:
post['MEDIAURL'] = self.getLink(post['CONTENTURL']) self.post['MEDIAURL'] = self._get_link(self.post['CONTENTURL'])
except IndexError: except IndexError:
raise NotADownloadableLinkError("Could not read the page source") raise NotADownloadableLinkError("Could not read the page source")
post['EXTENSION'] = self.getExtension(post['MEDIAURL']) self.post['EXTENSION'] = self._get_extension(self.post['MEDIAURL'])
self.directory.mkdir(exist_ok=True)
if not os.path.exists(directory): filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"]
os.makedirs(directory)
filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] self._download_resource(filename, self.directory, self.post['MEDIAURL'])
short_filename = post['POSTID'] + post['EXTENSION']
self.getFile(filename, short_filename, directory, post['MEDIAURL'])
@staticmethod @staticmethod
def getLink(url: str) -> str: def _get_link(url: str) -> str:
"""Extract direct link to the video from page's source """Extract direct link to the video from page's source
and return it and return it
""" """

View file

@ -1,6 +1,8 @@
#!/usr/bin/env python3
import json import json
import os
import pathlib import pathlib
import logging
import requests import requests
@ -9,7 +11,8 @@ from bulkredditdownloader.downloaders.direct import Direct
from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError, from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError,
ImageNotFound, NotADownloadableLinkError, TypeInSkip) ImageNotFound, NotADownloadableLinkError, TypeInSkip)
from bulkredditdownloader.utils import GLOBAL, nameCorrector from bulkredditdownloader.utils import GLOBAL, nameCorrector
from bulkredditdownloader.utils import printToFile as print
logger = logging.getLogger(__name__)
class Imgur(BaseDownloader): class Imgur(BaseDownloader):
@ -18,24 +21,28 @@ class Imgur(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict): def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post) super().__init__(directory, post)
link = post['CONTENTURL'] self.raw_data = {}
self.download()
def download(self):
link = self.post['CONTENTURL']
if link.endswith(".gifv"): if link.endswith(".gifv"):
link = link.replace(".gifv", ".mp4") link = link.replace(".gifv", ".mp4")
Direct(directory, {**post, 'CONTENTURL': link}) Direct(self.directory, {**self.post, 'CONTENTURL': link})
return return
self.raw_data = self.getData(link) self.raw_data = self._get_data(link)
if self.isAlbum: if self._is_album:
if self.raw_data["album_images"]["count"] != 1: if self.raw_data["album_images"]["count"] != 1:
self.downloadAlbum(self.raw_data["album_images"]) self._download_album(self.raw_data["album_images"])
else: else:
self.download(self.raw_data["album_images"]["images"][0]) self._download_image(self.raw_data["album_images"]["images"][0])
else: else:
self.download(self.raw_data) self._download_image(self.raw_data)
def downloadAlbum(self, images: dict): def _download_album(self, images: dict):
folder_name = GLOBAL.config['filename'].format(**self.post) folder_name = GLOBAL.config['filename'].format(**self.post)
folder_dir = self.directory / folder_name folder_dir = self.directory / folder_name
@ -43,70 +50,60 @@ class Imgur(BaseDownloader):
how_many_downloaded = 0 how_many_downloaded = 0
duplicates = 0 duplicates = 0
try: folder_dir.mkdir(exist_ok=True)
if not os.path.exists(folder_dir): logger.info(folder_name)
os.makedirs(folder_dir)
except FileNotFoundError:
folder_dir = self.directory / self.post['POSTID']
os.makedirs(folder_dir)
print(folder_name)
for i in range(images_length): for i in range(images_length):
extension = self.validateExtension(images["images"][i]["ext"]) extension = self._validate_extension(images["images"][i]["ext"])
image_url = self.imgur_image_domain + images["images"][i]["hash"] + extension image_url = self.imgur_image_domain + images["images"][i]["hash"] + extension
filename = "_".join([str(i + 1), filename = pathlib.Path("_".join([str(i + 1),
nameCorrector(images["images"][i]['title']), nameCorrector(images["images"][i]['title']),
images["images"][i]['hash']]) + extension images["images"][i]['hash']]) + extension)
short_filename = str(i + 1) + "_" + images["images"][i]['hash']
print("\n ({}/{})".format(i + 1, images_length)) logger.info("\n ({}/{})".format(i + 1, images_length))
try: try:
self.getFile(filename, short_filename, folder_dir, image_url, indent=2) self._download_resource(filename, folder_dir, image_url, indent=2)
how_many_downloaded += 1 how_many_downloaded += 1
print()
except FileAlreadyExistsError: except FileAlreadyExistsError:
print(" The file already exists" + " " * 10, end="\n\n") logger.info(" The file already exists" + " " * 10, end="\n\n")
duplicates += 1 duplicates += 1
except TypeInSkip: except TypeInSkip:
print(" Skipping...") logger.info(" Skipping...")
how_many_downloaded += 1 how_many_downloaded += 1
except Exception as exception: except Exception as exception:
print("\n Could not get the file") logger.info("\n Could not get the file")
print( logger.info(
" " + " "
"{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format( + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format(
class_name=exception.__class__.__name__, class_name=exception.__class__.__name__,
info=str(exception) info=str(exception)
) )
+ "\n" + "\n"
) )
print(GLOBAL.log_stream.getvalue(), no_print=True) logger.info(GLOBAL.log_stream.getvalue(), no_print=True)
if duplicates == images_length: if duplicates == images_length:
raise FileAlreadyExistsError raise FileAlreadyExistsError
elif how_many_downloaded + duplicates < images_length: elif how_many_downloaded + duplicates < images_length:
raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely") raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely")
def download(self, image: dict): def _download_image(self, image: dict):
extension = self.validateExtension(image["ext"]) extension = self._validate_extension(image["ext"])
image_url = self.imgur_image_domain + image["hash"] + extension image_url = self.imgur_image_domain + image["hash"] + extension
filename = GLOBAL.config['filename'].format(**self.post) + extension filename = GLOBAL.config['filename'].format(**self.post) + extension
short_filename = self.post['POSTID'] + extension
self.getFile(filename, short_filename, self.directory, image_url) self._download_resource(filename, self.directory, image_url)
@property def _is_album(self) -> bool:
def isAlbum(self) -> bool:
return "album_images" in self.raw_data return "album_images" in self.raw_data
@staticmethod @staticmethod
def getData(link: str) -> dict: def _get_data(link: str) -> dict:
cookies = {"over18": "1", "postpagebeta": "0"} cookies = {"over18": "1", "postpagebeta": "0"}
res = requests.get(link, cookies=cookies) res = requests.get(link, cookies=cookies)
if res.status_code != 200: if res.status_code != 200:
@ -128,18 +125,18 @@ class Imgur(BaseDownloader):
end_index -= 1 end_index -= 1
try: try:
data = page_source[start_index:end_index + 2].strip()[:-1] data = page_source[start_index:end_index + 2].strip()[:-1]
except Exception: except IndexError:
page_source[end_index + 1] = '}' page_source[end_index + 1] = '}'
data = page_source[start_index:end_index + 3].strip()[:-1] data = page_source[start_index:end_index + 3].strip()[:-1]
return json.loads(data) return json.loads(data)
@staticmethod @staticmethod
def validateExtension(string: str) -> str: def _validate_extension(extension_suffix: str) -> str:
possible_extensions = [".jpg", ".png", ".mp4", ".gif"] possible_extensions = [".jpg", ".png", ".mp4", ".gif"]
for extension in possible_extensions: for extension in possible_extensions:
if extension in string: if extension in extension_suffix:
return extension return extension
else: else:
raise ExtensionError(f"\"{string}\" is not recognized as a valid extension.") raise ExtensionError(f"\"{extension_suffix}\" is not recognized as a valid extension.")

View file

@ -1,35 +1,25 @@
#!/usr/bin/env python3
import json import json
import os
import pathlib import pathlib
import urllib.request import urllib.request
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.downloaders.gif_delivery_network import GifDeliveryNetwork
from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.errors import NotADownloadableLinkError
from bulkredditdownloader.utils import GLOBAL
class Redgifs(BaseDownloader): class Redgifs(GifDeliveryNetwork):
def __init__(self, directory: pathlib.Path, post: dict): def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post) super().__init__(directory, post)
try: self.download()
post['MEDIAURL'] = self.getLink(post['CONTENTURL'])
except IndexError:
raise NotADownloadableLinkError("Could not read the page source")
post['EXTENSION'] = self.getExtension(post['MEDIAURL']) def download(self):
super().download()
if not os.path.exists(directory):
os.makedirs(directory)
filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"]
short_filename = post['POSTID'] + post['EXTENSION']
self.getFile(filename, short_filename, directory, post['MEDIAURL'])
@staticmethod @staticmethod
def getLink(url: str) -> str: def _get_link(url: str) -> str:
"""Extract direct link to the video from page's source """Extract direct link to the video from page's source
and return it and return it
""" """

View file

@ -1,45 +1,46 @@
from src.utils import printToFile as print #!/usr/bin/env python3
import io import io
import os import logging
import pathlib import pathlib
from pathlib import Path from pathlib import Path
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.errors import FileAlreadyExistsError, TypeInSkip from bulkredditdownloader.errors import FileAlreadyExistsError, TypeInSkip
from bulkredditdownloader.utils import GLOBAL from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print
VanillaPrint = print logger = logging.getLogger(__name__)
class SelfPost(BaseDownloader): class SelfPost(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict): def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post) super().__init__(directory, post)
self.download()
def download(self):
if "self" in GLOBAL.arguments.skip: if "self" in GLOBAL.arguments.skip:
raise TypeInSkip raise TypeInSkip
if not os.path.exists(directory): self.directory.mkdir(exist_ok=True)
os.makedirs(directory) filename = GLOBAL.config['filename'].format(**self.post)
filename = GLOBAL.config['filename'].format(**post) file_dir = self.directory / (filename + ".md")
logger.info(file_dir)
file_dir = directory / (filename + ".md") logger.info(filename + ".md")
print(file_dir)
print(filename + ".md")
if Path.is_file(file_dir): if Path.is_file(file_dir):
raise FileAlreadyExistsError raise FileAlreadyExistsError
try: try:
self.writeToFile(file_dir, post) self._write_to_file(file_dir, self.post)
except FileNotFoundError: except FileNotFoundError:
file_dir = post['POSTID'] + ".md" file_dir = self.post['POSTID'] + ".md"
file_dir = directory / file_dir file_dir = self.directory / file_dir
self.writeToFile(file_dir, post) self._write_to_file(file_dir, self.post)
@staticmethod @staticmethod
def writeToFile(directory: pathlib.Path, post: dict): def _write_to_file(directory: pathlib.Path, post: dict):
"""Self posts are formatted here""" """Self posts are formatted here"""
content = ("## [" content = ("## ["
+ post["TITLE"] + post["TITLE"]
@ -59,5 +60,5 @@ class SelfPost(BaseDownloader):
+ ")") + ")")
with io.open(directory, "w", encoding="utf-8") as FILE: with io.open(directory, "w", encoding="utf-8") as FILE:
VanillaPrint(content, file=FILE) print(content, file=FILE)
print("Downloaded") logger.info("Downloaded")

View file

@ -1,51 +1,56 @@
#!/usr/bin/env python3
import logging
import os import os
import pathlib import pathlib
import subprocess import subprocess
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.utils import GLOBAL from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print
logger = logging.getLogger(__name__)
class VReddit(BaseDownloader): class VReddit(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict):
    """Initialise the base downloader state, then download straight away.

    Args:
        directory: Folder the downloaded files are written into.
        post: Post fields. ``download`` reads ``POSTID`` and ``CONTENTURL``
            and expands the whole mapping into the filename template.
    """
    super().__init__(directory, post)
    self.download()

def download(self):
    """Fetch the post's video and, when ffmpeg can be run, merge in its audio.

    If the ``ffmpeg`` executable cannot be started, only the resource at
    ``CONTENTURL`` is downloaded under the configured filename. Otherwise
    the video and the ``DASH_audio.mp4`` file next to it are downloaded
    separately and handed to ``_merge_audio``.
    """
    extension = ".mp4"
    # parents=True keeps the behaviour of the os.makedirs() call this
    # replaced: a missing parent folder must not abort the download.
    self.directory.mkdir(parents=True, exist_ok=True)

    filename = GLOBAL.config['filename'].format(**self.post) + extension

    try:
        # Probe for ffmpeg. subprocess.DEVNULL discards the output without
        # opening (and leaking) a handle on os.devnull.
        subprocess.call("ffmpeg", stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    except OSError:
        # Raised when the executable is missing or cannot be executed.
        self._download_resource(filename, self.directory, self.post['CONTENTURL'])
        logger.info("FFMPEG library not found, skipping merging video and audio")
    else:
        video_name = self.post['POSTID'] + "_video"
        video_url = self.post['CONTENTURL']
        audio_name = self.post['POSTID'] + "_audio"
        # The audio track is taken from the same URL folder as the video.
        audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4'

        # Logger methods do not accept print()'s ``sep`` keyword; use lazy
        # %-style arguments to emit the same two lines.
        logger.info("%s\n%s", self.directory, filename)

        self._download_resource(video_name, self.directory, video_url, silent=True)
        self._download_resource(audio_name, self.directory, audio_url, silent=True)
        try:
            self._merge_audio(video_name, audio_name, filename, self.directory)
        except KeyboardInterrupt:
            # Remove every partial artefact exactly once. Any of them may
            # already be gone (or never created) depending on where the
            # merge was interrupted, so a missing file is not an error.
            for leftover in (filename, audio_name, video_name):
                try:
                    (self.directory / leftover).unlink()
                except FileNotFoundError:
                    pass
@staticmethod @staticmethod
def _mergeAudio( def _merge_audio(
video: pathlib.Path, video: pathlib.Path,
audio: pathlib.Path, audio: pathlib.Path,
filename: pathlib.Path, filename: pathlib.Path,
short_filename,
directory: pathlib.Path): directory: pathlib.Path):
input_video = str(directory / video) input_video = str(directory / video)
input_audio = str(directory / audio) input_audio = str(directory / audio)
@ -55,5 +60,5 @@ class VReddit(BaseDownloader):
input_audio, input_video, str(directory / filename)) input_audio, input_video, str(directory / filename))
subprocess.call(cmd.split(), stdout=fnull, stderr=subprocess.STDOUT) subprocess.call(cmd.split(), stdout=fnull, stderr=subprocess.STDOUT)
os.remove(directory / video) (directory / video).unlink()
os.remove(directory / audio) (directory / audio).unlink()

View file

@ -1,3 +1,6 @@
#!/usr/bin/env python3
import logging
import os import os
import pathlib import pathlib
import sys import sys
@ -7,21 +10,24 @@ import youtube_dl
from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.errors import FileAlreadyExistsError from bulkredditdownloader.errors import FileAlreadyExistsError
from bulkredditdownloader.utils import GLOBAL from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print
logger = logging.getLogger(__name__)
class Youtube(BaseDownloader): class Youtube(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict):
    """Initialise the base downloader state, then download straight away.

    Args:
        directory: Folder the video is written into.
        post: Post fields. ``download`` reads ``CONTENTURL`` and expands
            the whole mapping into the filename template.
    """
    super().__init__(directory, post)
    self.download()

def download(self):
    """Create the target folder and download the post's video into it.

    The filename (without extension) comes from the configured template;
    the actual transfer is delegated to ``_download_video``.
    """
    # parents=True keeps the behaviour of the os.makedirs() call this
    # replaced: a missing parent folder must not abort the download.
    self.directory.mkdir(parents=True, exist_ok=True)

    filename = GLOBAL.config['filename'].format(**self.post)
    logger.info(filename)

    self._download_video(filename, self.directory, self.post['CONTENTURL'])
def _download_video(self, filename: str, directory: pathlib.Path, url: str):
ydl_opts = { ydl_opts = {
"format": "best", "format": "best",
"outtmpl": str(directory / (filename + ".%(ext)s")), "outtmpl": str(directory / (filename + ".%(ext)s")),
@ -35,9 +41,12 @@ class Youtube(BaseDownloader):
location = directory / (filename + ".mp4") location = directory / (filename + ".mp4")
with open(location, 'rb') as file:
content = file.read()
if GLOBAL.arguments.no_dupes: if GLOBAL.arguments.no_dupes:
try: try:
file_hash = self.createHash(str(location)) file_hash = self._create_hash(content)
except FileNotFoundError: except FileNotFoundError:
return None return None
if file_hash in GLOBAL.downloadedPosts(): if file_hash in GLOBAL.downloadedPosts():
@ -48,7 +57,7 @@ class Youtube(BaseDownloader):
@staticmethod @staticmethod
def _hook(d): def _hook(d):
if d['status'] == 'finished': if d['status'] == 'finished':
return print("Downloaded") return logger.info("Downloaded")
downloaded_mbs = int(d['downloaded_bytes'] * (10**(-6))) downloaded_mbs = int(d['downloaded_bytes'] * (10**(-6)))
file_size = int(d['total_bytes'] * (10**(-6))) file_size = int(d['total_bytes'] * (10**(-6)))
sys.stdout.write("{}Mb/{}Mb\r".format(downloaded_mbs, file_size)) sys.stdout.write("{}Mb/{}Mb\r".format(downloaded_mbs, file_size))