
Calculate existing hashes in parallel

Authored by Serene-Arc on 2021-03-26 10:42:51 +10:00, committed by Ali Parlakci
parent 79105f9f84
commit 771cc711e4


@@ -11,6 +11,7 @@ import shutil
 import socket
 from datetime import datetime
 from enum import Enum, auto
+from multiprocessing import Pool
 from pathlib import Path
 from typing import Iterator
@@ -31,6 +32,12 @@ from bulkredditdownloader.site_downloaders.download_factory import DownloadFactory
 logger = logging.getLogger(__name__)
+def _calc_hash(existing_file: Path):
+    with open(existing_file, 'rb') as file:
+        file_hash = hashlib.md5(file.read()).hexdigest()
+    return existing_file, file_hash
 class RedditTypes:
     class SortType(Enum):
         HOT = auto()
@@ -373,9 +380,10 @@ class RedditDownloader:
         for (dirpath, dirnames, filenames) in os.walk(directory):
             files.extend([Path(dirpath, file) for file in filenames])
         logger.info(f'Calculating hashes for {len(files)} files')
-        hash_list = {}
-        for existing_file in files:
-            with open(existing_file, 'rb') as file:
-                hash_list[hashlib.md5(file.read()).hexdigest()] = existing_file
-            logger.log(9, f'Hash calculated for file at {existing_file}')
+        pool = Pool(15)
+        results = pool.map(_calc_hash, files)
+        pool.close()
+        hash_list = {res[1]: res[0] for res in results}
         return hash_list
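
For readers skimming the diff: the commit replaces the serial per-file loop with a worker pool, so each file's MD5 is computed in its own process and the resulting (path, hash) pairs are folded back into the same hash-to-path dictionary. Below is a minimal standalone sketch of the same pattern, not the project's exact code; the directory argument, the context-manager shutdown, and the main guard are illustrative additions, and the pool size of 15 simply mirrors the commit.

    import hashlib
    import os
    from multiprocessing import Pool
    from pathlib import Path


    def _calc_hash(existing_file: Path):
        # Runs in a worker process, so it has to be a module-level function
        # (workers receive it by name, not as a bound method or closure).
        with open(existing_file, 'rb') as file:
            return existing_file, hashlib.md5(file.read()).hexdigest()


    def scan_existing_files(directory: Path) -> dict:
        # Gather every file below the directory, as the original loop does.
        files = []
        for dirpath, _dirnames, filenames in os.walk(directory):
            files.extend(Path(dirpath, name) for name in filenames)
        # Hash the files in parallel; the context manager shuts the pool down
        # once map() has returned all (path, hash) tuples.
        with Pool(15) as pool:
            results = pool.map(_calc_hash, files)
        # Same shape as the commit's result: hash -> path.
        return {file_hash: path for path, file_hash in results}


    if __name__ == '__main__':  # required on spawn-based platforms (Windows, macOS)
        print(scan_existing_files(Path('.')))

The hash-keyed dictionary presumably lets the downloader skip files whose content it has already saved; keeping _calc_hash at module level (rather than as a lambda or method) is what makes it picklable for the pool's worker processes.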