# bulk-downloader-for-reddit/src/downloaders/downloaderUtils.py

import sys
import os
import time
from urllib.error import HTTPError
import urllib.request
from pathlib import Path
import hashlib
from src.utils import nameCorrector, GLOBAL
from src.utils import printToFile as print
from src.errors import FileAlreadyExistsError, FileNameTooLong, FailedToDownload, TypeInSkip, DomainInSkip

def dlProgress(count, blockSize, totalSize):
    """Write download progress to the console."""

    # 10**(-6) converts a byte count to (decimal) megabytes
    downloadedMbs = int(count*blockSize*(10**(-6)))
    fileSize = int(totalSize*(10**(-6)))
    sys.stdout.write("{}Mb/{}Mb\r".format(downloadedMbs, fileSize))
    sys.stdout.flush()
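
# How urlretrieve drives the hook above (a sketch; the URL and target
# path are placeholders, not values used by this module):
#
#   urllib.request.urlretrieve("https://example.com/file.mp4",
#                              "file.mp4.tmp",
#                              reporthook=dlProgress)
#
# urlretrieve calls dlProgress(count, blockSize, totalSize) after each
# block, and the trailing "\r" makes the console line update in place.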

def getExtension(link):
    """Extract the file extension from a media link.
    If no known extension is found, default to '.jpg',
    or to '.mp4' for v.redd.it links, which are always videos.
    """

    imageTypes = ['jpg', 'png', 'mp4', 'webm', 'gif']
    parsed = link.split('.')
    for fileType in imageTypes:
        if fileType in parsed:
            return "." + parsed[-1]

    # No known extension found anywhere in the URL
    if "v.redd.it" in link:
        return '.mp4'
    return '.jpg'
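
# Illustrative results (example URLs only):
#
#   getExtension("https://i.redd.it/abcd.png")  # -> ".png"
#   getExtension("https://v.redd.it/abcd")      # -> ".mp4" (always video)
#   getExtension("https://i.redd.it/abcd")      # -> ".jpg" (default)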

def getFile(filename, shortFilename, folderDir, imageURL,
            indent=0, silent=False):
    """Download imageURL into folderDir as filename, retrying up to
    three times and falling back to shortFilename if the name is too
    long for the filesystem.
    """

    FORMATS = {
        "videos": [".mp4", ".webm"],
        "images": [".jpg", ".jpeg", ".png", ".bmp"],
        "gifs": [".gif"]
    }

    # Honour the user's skip lists before touching the network
    for skippedType in GLOBAL.arguments.skip:
        for extension in FORMATS[skippedType]:
            if extension in filename:
                raise TypeInSkip

    if any(domain in imageURL for domain in GLOBAL.arguments.skip_domain):
        raise DomainInSkip

    headers = [
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;"
                   "q=0.9,image/webp,image/apng,*/*;q=0.8"),
        ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"),
        ("Accept-Encoding", "none"),
        ("Accept-Language", "en-US,en;q=0.8"),
        ("Connection", "keep-alive")
    ]

    if not os.path.exists(folderDir):
        os.makedirs(folderDir)

    # Imgur refuses requests carrying these browser-like headers,
    # so only install them for other hosts
    opener = urllib.request.build_opener()
    if "imgur" not in imageURL:
        opener.addheaders = headers
    urllib.request.install_opener(opener)

    if not silent:
        print(" "*indent + str(folderDir),
              " "*indent + str(filename),
              sep="\n")

    for i in range(3):
        fileDir = Path(folderDir) / filename
        tempDir = Path(folderDir) / (filename + ".tmp")

        if os.path.isfile(fileDir):
            raise FileAlreadyExistsError

        try:
            # Download to a .tmp file first so an interrupted transfer
            # never leaves a partial file under the final name
            urllib.request.urlretrieve(imageURL,
                                       tempDir,
                                       reporthook=dlProgress)

            fileHash = createHash(tempDir)
            if GLOBAL.arguments.no_dupes:
                if fileHash in GLOBAL.downloadedPosts():
                    os.remove(tempDir)
                    raise FileAlreadyExistsError
            GLOBAL.downloadedPosts.add(fileHash)

            os.rename(tempDir, fileDir)
            if not silent:
                print(" "*indent + "Downloaded" + " "*10)
            return None

        except ConnectionResetError as exception:
            if not silent:
                print(" "*indent + str(exception))
                print(" "*indent + "Trying again\n")

        except FileNotFoundError:
            # Raised when the temp path is too long for the
            # filesystem; retry with the short fallback name
            filename = shortFilename

    raise FailedToDownload
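
# A minimal call sketch (all argument values below are hypothetical,
# and getFile also expects GLOBAL.arguments.skip, skip_domain, no_dupes
# and GLOBAL.downloadedPosts to be initialised by the caller):
#
#   getFile("My Post Title_abc123.jpg",      # preferred file name
#           "abc123.jpg",                    # fallback short name
#           "downloads/pics",                # created if missing
#           "https://i.redd.it/abc123.jpg")  # source URL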

def createHash(filename):
    """Return the MD5 hex digest of the file, read in 4 KiB chunks
    so large files never have to fit in memory at once.
    """
    hash_md5 = hashlib.md5()
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
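
# createHash backs the no_dupes check in getFile: byte-identical files
# hash to the same digest, so the second copy is removed. A sketch with
# a hypothetical path:
#
#   if createHash("downloads/pics/abc123.jpg") in GLOBAL.downloadedPosts():
#       pass  # duplicate content; getFile would delete it and raise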