1
0
Fork 0
mirror of synced 2024-06-13 16:05:18 +12:00
ArchiveBox/archivebox/plugins/gallerydl/models.py
2024-01-24 00:07:07 -08:00

122 lines
3.8 KiB
Python

from pathlib import Path

from django.db import models
from django.utils.functional import cached_property
from solo.models import SingletonModel

from archivebox.plugins.defaults.models import (
    ArchiveBoxDefaultDependency,
    ArchiveBoxDefaultExtractor,
    BashEnvironmentDependency,
    PipEnvironmentDependency,
)
class GalleryDLDependency(ArchiveBoxDefaultDependency, SingletonModel):
    """Singleton settings model describing the ``gallery-dl`` dependency.

    The upper-case class constants declare what the dependency is and how it
    is obtained (binary name, pip package, parent dependencies); the Django
    fields below them hold the user-editable settings stored in the single
    database row for this model.

    NOTE(review): the exact meaning of each constant is defined by
    ``ArchiveBoxDefaultDependency``, which is not visible from this file.
    """

    NAME = 'GALLERYDL'
    LABEL = "GalleryDL"
    REQUIRED = False

    # Dependencies that are declared as parents of this one.
    PARENT_DEPENDENCIES = [
        BashEnvironmentDependency,
        PipEnvironmentDependency,
    ]

    # gallery-dl is only listed as a pip package; no apt/brew/npm packages.
    BIN_DEPENDENCIES = ['gallery-dl']
    APT_DEPENDENCIES = []
    BREW_DEPENDENCIES = []
    PIP_PACKAGES = ['gallery-dl']
    NPM_PACKAGES = []

    DEFAULT_BINARY = 'gallery-dl'
    DEFAULT_START_CMD = None
    DEFAULT_ARGS = []
    # '{BINARY}' is a template placeholder, filled in outside this class.
    VERSION_CMD = '{BINARY} --version'

    ENABLED = models.BooleanField(default=True)
    # Reuse DEFAULT_BINARY so the constant and the column default cannot drift.
    BINARY = models.CharField(max_length=255, default=DEFAULT_BINARY)
    # Integer default (not the string '1') so unsaved instances expose an int.
    WORKERS = models.IntegerField(default=1)
class GalleryDLExtractor(ArchiveBoxDefaultExtractor, SingletonModel):
    """Singleton settings model and extraction logic for ``gallery-dl``.

    Upstream project: https://github.com/mikf/gallery-dl

    String settings on this model are ``str.format`` templates (for example
    ``'{TIMEOUT}'``) that are expanded against the runtime ``config`` when
    :meth:`extract` builds the command line.
    """

    NAME = 'GALLERYDL'
    LABEL = 'gallery-dl'

    # NOTE(review): get_solo() is evaluated while the class body executes,
    # i.e. at import time -- presumably this needs the database to be ready
    # before this module is imported; confirm, or defer the lookup.
    DEPENDENCY = GalleryDLDependency.get_solo()

    # https://github.com/mikf/gallery-dl
    DEFAULT_CMD = [
        '{DEPENDENCY.BINARY}',
        '{ARGS}',
        '{url}',
    ]
    # Deferred templates: neither an instance nor a config exists while the
    # class body is being evaluated, so values cannot be expanded here.
    # NOTE(review): confirm these flag names against gallery-dl's CLI
    # reference; its documented options may be spelled differently.
    DEFAULT_ARGS = [
        '--timeout', '{TIMEOUT}',
        '--cookies', '{COOKIES_TXT}',
        '--user-agent', '{USER_AGENT}',
        '--verify', '{CHECK_SSL_VALIDITY}',
    ]

    ENABLED = models.BooleanField(default=True)
    # NOTE(review): the default here is a list on a CharField -- confirm how
    # the base class expects CMD to be stored and rendered.
    CMD = models.CharField(max_length=255, default=DEFAULT_CMD)
    # NOTE(review): CSVField is not a stock django.db.models field;
    # presumably it is provided elsewhere in the project -- confirm.
    ARGS = models.CSVField(max_length=255, default=DEFAULT_ARGS)

    TIMEOUT = models.CharField(max_length=255, default='{TIMEOUT}')
    USER_AGENT = models.CharField(max_length=255, default='{USER_AGENT}')
    COOKIES_TXT = models.CharField(max_length=255, default='{COOKIES_TXT}')
    CHECK_SSL_VALIDITY = models.CharField(max_length=255, default='{CHECK_SSL_VALIDITY}')

    # @task
    # @requires_config('HOSTNAME', 'TIMEOUT', 'USER_AGENT', 'CHECK_SSL_VALIDITY')
    def extract(self, url: str, out_dir: Path, config: 'ConfigDict'):
        """Run gallery-dl against ``url`` and describe the outcome.

        Args:
            url: The URL handed to gallery-dl.
            out_dir: Directory passed to ``create_extractor_directory`` and
                recorded as ``pwd`` on the result.
            config: Runtime configuration. It is expanded with ``**config``
                to fill the ``'{...}'`` templates and is also read via
                attribute access (``config.HOSTNAME``).

        Returns:
            ``None`` when the extractor is disabled, otherwise an
            ``ArchiveResult`` whose status is ``'succeeded'`` or ``'failed'``.
            Exceptions raised while running the command are not propagated;
            their text is appended to the result's ``stderr``.

        NOTE(review): ``split_args``, ``ArchiveError``, ``get_dir_size`` and
        ``ArchiveResult`` are not imported in the visible part of this file.
        """
        if not self.ENABLED:
            return

        extractor_dir = self.create_extractor_directory(out_dir)

        # NOTE(review): the four flags below are also present in DEFAULT_ARGS,
        # so with the default ARGS they may be passed twice -- confirm which
        # of the two places should own them.
        cmd = [
            self.CMD,
            url,
            '--timeout', self.TIMEOUT.format(**config),
            '--cookies', self.COOKIES_TXT.format(**config),
            # The user agent comes from USER_AGENT, not the cookies file path.
            '--user-agent', self.USER_AGENT.format(**config),
            '--verify', self.CHECK_SSL_VALIDITY.format(**config),
            *split_args(self.ARGS.format(**config)),
        ]

        status, stdout, stderr, output_path = 'failed', '', '', None
        # Stays None if run() raises before returning a timer.
        timer = None
        try:
            # NOTE(review): GALLERYDL_TIMEOUT is not defined in this class;
            # presumably it should be derived from TIMEOUT -- confirm against
            # the base class.
            proc, timer, _errors = self.DEPENDENCY.run(
                cmd,
                cwd=extractor_dir,
                timeout=self.GALLERYDL_TIMEOUT,
            )
            stdout, stderr = proc.stdout, proc.stderr

            if 'ERROR: Unsupported URL' in stderr:
                hints = ('gallery-dl doesnt support this type of url yet',)
                raise ArchiveError('Failed to save gallerydl', hints)

            if proc.returncode == 0 and 'finished' in stdout:
                output_path = extractor_dir / 'index.html'
                status = 'succeeded'
        except Exception as err:
            # Best-effort: record the failure instead of raising. The
            # exception must be converted to text before it can be appended.
            stderr += str(err)

        num_bytes, num_dirs, num_files = get_dir_size(extractor_dir)
        # No timing stats are available when the command never started.
        timer_stats = timer.stats if timer is not None else {}

        return ArchiveResult(
            cmd=cmd,
            pwd=str(out_dir),
            cmd_version=self.DEPENDENCY.bin_version,
            cmd_path=self.DEPENDENCY.bin_path,
            cmd_hostname=config.HOSTNAME,
            output_path=output_path,
            stdout=stdout,
            stderr=stderr,
            status=status,
            num_bytes=num_bytes,
            num_files=num_files,
            num_dirs=num_dirs,
            **timer_stats,
        )