
Calculate existing hashes in parallel

Authored by Serene-Arc on 2021-03-26 10:42:51 +10:00, committed by Ali Parlakci
parent 79105f9f84
commit 771cc711e4


@@ -11,6 +11,7 @@ import shutil
 import socket
 from datetime import datetime
 from enum import Enum, auto
+from multiprocessing import Pool
 from pathlib import Path
 from typing import Iterator
@@ -31,6 +32,12 @@ from bulkredditdownloader.site_downloaders.download_factory import DownloadFactory
 logger = logging.getLogger(__name__)
+def _calc_hash(existing_file: Path):
+    with open(existing_file, 'rb') as file:
+        file_hash = hashlib.md5(file.read()).hexdigest()
+    return existing_file, file_hash
 class RedditTypes:
     class SortType(Enum):
         HOT = auto()
@@ -373,9 +380,10 @@ class RedditDownloader:
         for (dirpath, dirnames, filenames) in os.walk(directory):
             files.extend([Path(dirpath, file) for file in filenames])
         logger.info(f'Calculating hashes for {len(files)} files')
-        hash_list = {}
-        for existing_file in files:
-            with open(existing_file, 'rb') as file:
-                hash_list[hashlib.md5(file.read()).hexdigest()] = existing_file
-            logger.log(9, f'Hash calculated for file at {existing_file}')
+        pool = Pool(15)
+        results = pool.map(_calc_hash, files)
+        pool.close()
+        hash_list = {res[1]: res[0] for res in results}
         return hash_list
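
For readers skimming the diff: the commit replaces the serial per-file loop with a worker pool, so each file's MD5 is computed in its own process and the resulting (path, hash) pairs are folded back into the same hash-to-path dictionary. Below is a minimal standalone sketch of the same pattern, not the project's exact code; the directory argument, the context-manager shutdown, and the main guard are illustrative additions, and the pool size of 15 simply mirrors the commit.

    import hashlib
    import os
    from multiprocessing import Pool
    from pathlib import Path


    def _calc_hash(existing_file: Path):
        # Runs in a worker process, so it has to be a module-level function
        # (workers receive it by name, not as a bound method or closure).
        with open(existing_file, 'rb') as file:
            return existing_file, hashlib.md5(file.read()).hexdigest()


    def scan_existing_files(directory: Path) -> dict:
        # Gather every file below the directory, as the original loop does.
        files = []
        for dirpath, _dirnames, filenames in os.walk(directory):
            files.extend(Path(dirpath, name) for name in filenames)
        # Hash the files in parallel; the context manager shuts the pool down
        # once map() has returned all (path, hash) tuples.
        with Pool(15) as pool:
            results = pool.map(_calc_hash, files)
        # Same shape as the commit's result: hash -> path.
        return {file_hash: path for path, file_hash in results}


    if __name__ == '__main__':  # required on spawn-based platforms (Windows, macOS)
        print(scan_existing_files(Path('.')))

The hash-keyed dictionary presumably lets the downloader skip files whose content it has already saved; keeping _calc_hash at module level (rather than as a lambda or method) is what makes it picklable for the pool's worker processes.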