# Mirror of ArchiveBox/archivebox/plugins/gallerydl/models.py (synced 2024-08-23 14:01:40 +12:00).
# The web view this file was copied from reported: 196 lines, 6.1 KiB, Python.
from solo.models import SingletonModel
class GalleryDLDependency(SingletonModel):
    """Singleton configuration for the ``gallery-dl`` binary.

    Stores whether the dependency is enabled and which binary to invoke, and
    exposes cached lookups of the resolved binary path and version.
    """

    GALLERYDL_ENABLED = models.BooleanField(default=True)
    GALLERYDL_BINARY = models.CharField(max_length=255, default='gallery-dl')
    # GALLERYDL_WORKERS = models.IntegerField(default='{NUM_CORES}')

    class Meta:
        verbose_name = "GalleryDL Dependency Configuration"

    def __str__(self):
        return "GalleryDL Dependency Configuration"

    def __json__(self):
        """Return the stored configuration fields as a plain dict."""
        return {
            'GALLERYDL_ENABLED': self.GALLERYDL_ENABLED,
            'GALLERYDL_BINARY': self.GALLERYDL_BINARY,
        }

    @cached_property
    def bin_path(self):
        """Resolved location of ``GALLERYDL_BINARY``."""
        # Inside a method body the bare name resolves to the module-level
        # ``bin_path`` helper, not to this property.
        return bin_path(self.GALLERYDL_BINARY)

    @cached_property
    def bin_version(self):
        """Version reported for the resolved binary."""
        # Same scoping as above: this calls the module-level ``bin_version``.
        return bin_version(self.bin_path)

    @cached_property
    def is_valid(self):
        """True when both a binary path and a version could be determined."""
        return bool(self.bin_path and self.bin_version)

    @cached_property
    def enabled(self):
        """True when the dependency is switched on and the binary is valid."""
        return bool(self.GALLERYDL_ENABLED and self.is_valid)

    def run(self, args, pwd, timeout):
        """Run the binary with ``args`` and report the outcome.

        Args:
            args: Command-line arguments; the binary path is prepended here.
            pwd: Working directory handed to the module-level ``run`` helper.
            timeout: Timeout handed to both the helper and the progress timer.

        Returns:
            A ``(proc, timer, errors)`` tuple. ``proc`` is whatever the helper
            returned, or ``None`` if it raised; ``errors`` is the caught
            exception, or ``None`` on success.
        """
        # Pre-bind ``proc`` so the return below cannot hit an unbound local
        # when the helper raises.
        proc, errors = None, None
        timer = TimedProgress(timeout, prefix=' ')
        try:
            # NOTE(review): the original line was duplicated and carried a
            # dangling ``=True`` whose keyword name was lost; restore it if
            # the helper needs it (e.g. a text/capture flag) -- TODO confirm.
            # NOTE(review): if the helper forwards keywords to ``subprocess``,
            # the working-directory keyword is ``cwd``, not ``pwd`` -- confirm.
            proc = run(cmd=[self.bin_path, *args], pwd=pwd, timeout=timeout)
        except Exception as err:
            # Deliberately broad: the error is returned to the caller, not hidden.
            errors = err
        finally:
            timer.end()
        return proc, timer, errors

    def pretty_version(self):
        """Return a one-line, ANSI-coloured status summary for this dependency."""
        # Branch on the raw flag first: ``self.enabled`` already requires
        # ``is_valid``, so testing it would report a broken binary as
        # "disabled" and never reach the "invalid" case.
        if not self.GALLERYDL_ENABLED:
            color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
        elif self.is_valid:
            color, symbol, note, version = 'green', '', 'valid', ''
            parsed_version_num = re.search(r'[\d\.]+', self.bin_version)
            if parsed_version_num:
                version = f'v{parsed_version_num[0]}'
        else:
            color, symbol, note, version = 'red', 'X', 'invalid', '?'

        path = pretty_path(self.bin_path)
        # The original referenced an undefined ``name``; the configured binary
        # name is used as the label -- NOTE(review): confirm the intended label.
        name = self.GALLERYDL_BINARY
        return ' '.join((
            ANSI[color],
            symbol,
            ANSI['reset'],
            name.ljust(21),
            version.ljust(14),
            ANSI[color],
            note.ljust(8),
            ANSI['reset'],
            path.ljust(76),
        ))
class GalleryDLExtractor(SingletonModel):
    """Singleton configuration and runner for the gallery-dl extractor.

    Holds the extractor's settings and knows how to invoke gallery-dl for a
    URL, writing into a ``gallerydl`` subdirectory of the output directory.
    """

    GALLERYDL_EXTRACTOR_NAME = 'gallerydl'

    SAVE_GALLERYDL = models.BooleanField(default=True)

    # https://github.com/mikf/gallery-dl
    # NOTE(review): stock Django has no ``models.CSVField``; confirm the
    # ``models`` namespace used here provides it.
    # ``default=list`` gives each row its own list instead of one shared object.
    GALLERYDL_ARGS = models.CSVField(max_length=255, default=list)
    # NOTE(review): Django calls callable defaults with no arguments, so this
    # one-argument lambda will fail when the default is evaluated (and lambdas
    # cannot be serialized into migrations). Replace it with a zero-argument
    # module-level function once the config source is confirmed.
    GALLERYDL_TIMEOUT = models.IntegerField(default=lambda c: c['TIMEOUT'])
    GALLERYDL_USER_AGENT = models.CharField(max_length=255, default='{USER_AGENT}')
    GALLERYDL_COOKIES_TXT = models.CharField(max_length=255, default='{COOKIES_TXT}')

    ALIASES = {
        'SAVE_GALLERYDL': ('USE_GALLERYDL', 'FETCH_GALLERYDL'),
    }

    class Meta:
        verbose_name = "GalleryDL Extractor Configuration"

    def __str__(self):
        return "GalleryDL Extractor Configuration"

    @cached_property
    def GALLERYDL_DEPENDENCY(self):
        """The gallery-dl dependency configuration singleton.

        Looked up on first access rather than in the class body, so importing
        this module does not run a database query.
        """
        return GalleryDLDependency.get_solo()

    @cached_property
    def enabled(self):
        """True when saving is switched on and the gallery-dl binary is valid."""
        return bool(self.SAVE_GALLERYDL and self.GALLERYDL_DEPENDENCY.is_valid)

    def __json__(self):
        """Return the extractor settings as a plain dict."""
        return {
            'SAVE_GALLERYDL': self.SAVE_GALLERYDL,
            'GALLERYDL_DEPENDENCY': self.GALLERYDL_DEPENDENCY.__json__(),
            'GALLERYDL_ARGS': self.GALLERYDL_ARGS,
            'GALLERYDL_TIMEOUT': self.GALLERYDL_TIMEOUT,
            'GALLERYDL_USER_AGENT': self.GALLERYDL_USER_AGENT,
            'GALLERYDL_COOKIES_TXT': self.GALLERYDL_COOKIES_TXT,
        }

    def validate(self):
        """Check the settings before they are saved.

        Raises:
            AssertionError: if ``GALLERYDL_TIMEOUT`` is below 5 seconds.
        """
        # Raised explicitly (same exception type as the former ``assert``) so
        # the check still runs under ``python -O``. The comparison now matches
        # the message: 5 itself is accepted.
        if not self.GALLERYDL_TIMEOUT >= 5:
            raise AssertionError('GALLERYDL_TIMEOUT must be at least 5 seconds')
        # assert Path(self.GALLERYDL_COOKIES_TXT).exists()
        # TODO: validate user agent with uaparser
        # TODO: validate args, cookies.txt?

    def save(self, *args, **kwargs):
        """Validate, save, and emit a ``GalleryDLExtractor.save`` event.

        Returns:
            Whatever the parent ``save()`` returns.
        """
        self.validate()
        with transaction.atomic():
            result = super().save(*args, **kwargs)
            # NOTE(review): indentation was lost in the original; the event is
            # emitted inside the atomic block here, so an exception from
            # emit_event rolls the save back -- confirm that is intended.
            emit_event({'type': 'GalleryDLExtractor.save', 'diff': self.__json__(), 'kwargs': kwargs})
            # potential consumers of this event:
            # - event logger: write to events.log
            # - config file updater: writes to ArchiveBox.conf
            # - supervisor: restarts relevant dependencies/extractors
            # - etc...
        return result

    def create_extractor_directory(self, parent_dir: Path):
        """Create (if needed) and return this extractor's subdirectory of ``parent_dir``."""
        subdir = parent_dir / self.GALLERYDL_EXTRACTOR_NAME
        # mkdir() returns None, so the path itself must be returned.
        subdir.mkdir(exist_ok=True)
        return subdir

    def should_extract(self, parent_dir: Path):
        """Return True when the extractor's subdirectory holds no entries yet."""
        existing_files = (parent_dir / self.GALLERYDL_EXTRACTOR_NAME).glob('*')
        # glob() yields lazily; a generator object is always truthy, so probe
        # it with any() instead of negating it directly.
        return not any(existing_files)

    @staticmethod
    def _as_text(value):
        """Coerce captured process output (None, bytes or str) to str."""
        if value is None:
            return ''
        if isinstance(value, bytes):
            return value.decode('utf-8', errors='replace')
        return str(value)

    def extract(self, url: str, out_dir: Path):
        """Run gallery-dl for ``url`` and describe the outcome.

        Args:
            url: The URL handed to gallery-dl.
            out_dir: Directory under which the extractor subdirectory is created.

        Returns:
            ``None`` when the extractor is disabled, otherwise an
            ``ArchiveResult`` whose status is ``'succeeded'`` or ``'failed'``.
        """
        if not self.enabled:
            return None

        dependency = self.GALLERYDL_DEPENDENCY
        extractor_dir = self.create_extractor_directory(out_dir)

        # Flag names follow gallery-dl's documented CLI: the HTTP timeout is
        # ``--http-timeout`` and certificate checks are turned off with the
        # value-less ``--no-check-certificate``.
        args = [url, '--http-timeout', str(self.GALLERYDL_TIMEOUT)]
        if self.GALLERYDL_COOKIES_TXT:
            args += ['--cookies', str(self.GALLERYDL_COOKIES_TXT)]
        if self.GALLERYDL_USER_AGENT:
            args += ['--user-agent', str(self.GALLERYDL_USER_AGENT)]
        if not config.CHECK_SSL_VALIDITY:
            args.append('--no-check-certificate')
        args.extend(self.GALLERYDL_ARGS or ())

        # run() prepends the binary path itself, so it receives only ``args``;
        # the full command is kept for the result record.
        cmd = [dependency.bin_path, *args]

        status, stdout, stderr, output_path = 'failed', '', '', None
        timer = None
        try:
            proc, timer, errors = dependency.run(args, pwd=extractor_dir, timeout=self.GALLERYDL_TIMEOUT)
            if errors is not None:
                raise errors
            stdout, stderr = self._as_text(proc.stdout), self._as_text(proc.stderr)
            # Matches the original 'ERROR: Unsupported URL' marker as well as
            # any other message containing the phrase.
            if 'Unsupported URL' in stderr:
                hints = ('gallery-dl doesnt support this type of url yet',)
                raise ArchiveError('Failed to save gallerydl', hints)
            # NOTE(review): confirm gallery-dl really prints 'finished' and
            # produces index.html; this success check is kept from the original.
            if proc.returncode == 0 and 'finished' in stdout:
                output_path = extractor_dir / 'index.html'
                status = 'succeeded'
        except Exception as err:
            # Record the failure in stderr instead of raising; status stays 'failed'.
            stderr = '\n'.join(filter(None, (stderr, f'{type(err).__name__}: {err}')))

        num_bytes, num_dirs, num_files = get_dir_size(extractor_dir)
        # The timer is missing only if run() itself raised before returning.
        timer_stats = timer.stats if timer is not None else {}
        return ArchiveResult(
            status=status,
            cmd=cmd,
            pwd=str(out_dir),
            cmd_version=dependency.bin_version,
            cmd_path=dependency.bin_path,
            cmd_hostname=config.HOSTNAME,
            output_path=output_path,
            stdout=stdout,
            stderr=stderr,
            num_bytes=num_bytes,
            num_files=num_files,
            num_dirs=num_dirs,
            **timer_stats,
        )