config.py 73 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528
  1. """
2. ArchiveBox config definitions (including defaults and dynamic config options).
  3. Config Usage Example:
  4. archivebox config --set MEDIA_TIMEOUT=600
  5. env MEDIA_TIMEOUT=600 USE_COLOR=False ... archivebox [subcommand] ...
  6. Config Precedence Order:
  7. 1. cli args (--update-all / --index-only / etc.)
  8. 2. shell environment vars (env USE_COLOR=False archivebox add '...')
  9. 3. config file (echo "SAVE_FAVICON=False" >> ArchiveBox.conf)
  10. 4. defaults (defined below in Python)
  11. Documentation:
  12. https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
  13. """
  14. __package__ = 'archivebox'
  15. import os
  16. import io
  17. import re
  18. import sys
  19. import json
  20. import inspect
  21. import getpass
  22. import platform
  23. import shutil
  24. import requests
  25. import django
  26. from sqlite3 import dbapi2 as sqlite3
  27. from hashlib import md5
  28. from pathlib import Path
  29. from datetime import datetime, timezone
  30. from typing import Optional, Type, Tuple, Dict, Union, List, Any
  31. from subprocess import run, PIPE, DEVNULL, STDOUT
  32. from configparser import ConfigParser
  33. from collections import defaultdict
  34. import importlib.metadata
  35. from .config_stubs import (
  36. AttrDict,
  37. SimpleConfigValueDict,
  38. ConfigValue,
  39. ConfigDict,
  40. ConfigDefaultValue,
  41. ConfigDefaultDict,
  42. )
############################### Config Schema ##################################

# Master schema of every user-configurable option, grouped by section.
# Each entry maps KEY -> {'type': ..., 'default': ..., 'aliases': (...)}.
# A 'default' may be a literal value or a lambda that receives the partially
# resolved config dict `c`, so defaults can depend on other config values.
CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
    'SHELL_CONFIG': {
        'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
        'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
        'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')},  # progress bars are buggy on mac, disable for now
        'IN_DOCKER': {'type': bool, 'default': False},
        'IN_QEMU': {'type': bool, 'default': False},
        'PUID': {'type': int, 'default': os.getuid()},
        'PGID': {'type': int, 'default': os.getgid()},
    },

    'GENERAL_CONFIG': {
        'OUTPUT_DIR': {'type': str, 'default': None},
        'CONFIG_FILE': {'type': str, 'default': None},
        'ONLY_NEW': {'type': bool, 'default': True},
        'TIMEOUT': {'type': int, 'default': 60},
        'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
        'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
        'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},  # TODO: move this to be a default WGET_ARGS
        'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)},  # to avoid downloading code assets as their own pages
        'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},
        'ADMIN_USERNAME': {'type': str, 'default': None},
        'ADMIN_PASSWORD': {'type': str, 'default': None},
        'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
        'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
    },

    'SERVER_CONFIG': {
        'SECRET_KEY': {'type': str, 'default': None},
        'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]},
        'ALLOWED_HOSTS': {'type': str, 'default': '*'},  # e.g. archivebox.example.com,archivebox2.example.com
        'CSRF_TRUSTED_ORIGINS': {'type': str, 'default': ''},  # e.g. https://archivebox.example.com,https://archivebox2.example.com:8080
        'DEBUG': {'type': bool, 'default': False},
        'PUBLIC_INDEX': {'type': bool, 'default': True},
        'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
        'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
        'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
        'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
        'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
        'TIME_ZONE': {'type': str, 'default': 'UTC'},
        'TIMEZONE': {'type': str, 'default': 'UTC'},
        'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
        'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
        'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
        'PREVIEW_ORIGINALS': {'type': bool, 'default': True},

        # LDAP auth integration (all options below are only used when LDAP=True)
        'LDAP': {'type': bool, 'default': False},
        'LDAP_SERVER_URI': {'type': str, 'default': None},
        'LDAP_BIND_DN': {'type': str, 'default': None},
        'LDAP_BIND_PASSWORD': {'type': str, 'default': None},
        'LDAP_USER_BASE': {'type': str, 'default': None},
        'LDAP_USER_FILTER': {'type': str, 'default': None},
        'LDAP_USERNAME_ATTR': {'type': str, 'default': None},
        'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
        'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
        'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
        'LDAP_CREATE_SUPERUSER': {'type': bool, 'default': False},
    },

    # On/off switches for each archiving extractor method
    'ARCHIVE_METHOD_TOGGLES': {
        'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)},
        'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
        'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
        'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
        'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
        'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)},
        'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)},
        'SAVE_HTMLTOTEXT': {'type': bool, 'default': True, 'aliases': ('FETCH_HTMLTOTEXT',)},
        'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
        'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
        'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
        'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)},
        'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
        'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
        'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
        'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
        'SAVE_ALLOWLIST': {'type': dict, 'default': {},},
        'SAVE_DENYLIST': {'type': dict, 'default': {},},
    },

    # Tuning knobs passed through to the individual extractor methods
    'ARCHIVE_METHOD_OPTIONS': {
        'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')},
        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
        'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
        'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},

        # {VERSION}/{CURL_VERSION}/{WGET_VERSION} placeholders are filled in later by the dynamic config
        'USER_AGENT': {'type': str, 'default': None},
        'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
        'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
        'CHROME_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},

        'COOKIES_FILE': {'type': str, 'default': None},
        'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
        'CHROME_TIMEOUT': {'type': int, 'default': 0},
        'CHROME_HEADLESS': {'type': bool, 'default': True},
        'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
        'CHROME_EXTRA_ARGS': {'type': list, 'default': None},

        'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
            '--restrict-filenames',
            '--trim-filenames', '128',
            '--write-description',
            '--write-info-json',
            '--write-annotations',
            '--write-thumbnail',
            '--no-call-home',
            '--write-sub',
            '--write-auto-subs',
            '--convert-subs=srt',
            '--yes-playlist',
            '--continue',
            # This flag doesn't exist in youtube-dl
            # only in yt-dlp
            '--no-abort-on-error',
            # --ignore-errors must come AFTER
            # --no-abort-on-error
            # https://github.com/yt-dlp/yt-dlp/issues/4914
            '--ignore-errors',
            '--geo-bypass',
            '--add-metadata',
            '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
        ]},
        'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},

        'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
            '--adjust-extension',
            '--convert-links',
            '--force-directories',
            '--backup-converted',
            '--span-hosts',
            '--no-parent',
            '-e', 'robots=off',
        ]},
        'WGET_EXTRA_ARGS': {'type': list, 'default': None},
        'CURL_ARGS': {'type': list, 'default': ['--silent',
            '--location',
            '--compressed'
        ]},
        'CURL_EXTRA_ARGS': {'type': list, 'default': None},
        'GIT_ARGS': {'type': list, 'default': ['--recursive']},
        'SINGLEFILE_ARGS': {'type': list, 'default': None},
        'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
        'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
        'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},
        'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
    },

    'SEARCH_BACKEND_CONFIG' : {
        'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
        'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
        'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'},
        'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
        'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
        'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
        'SEARCH_PROCESS_HTML': {'type': bool, 'default': True},
        # SONIC
        'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
        'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
        'SEARCH_BACKEND_TIMEOUT': {'type': int, 'default': 90},
        # SQLite3 FTS5
        'FTS_SEPARATE_DATABASE': {'type': bool, 'default': True},
        'FTS_TOKENIZERS': {'type': str, 'default': 'porter unicode61 remove_diacritics 2'},
        # Default from https://www.sqlite.org/limits.html#max_length
        'FTS_SQLITE_MAX_LENGTH': {'type': int, 'default': int(1e9)},
    },

    # Which external binaries to use (and whether to use them at all)
    'DEPENDENCY_CONFIG': {
        'USE_CURL': {'type': bool, 'default': True},
        'USE_WGET': {'type': bool, 'default': True},
        'USE_SINGLEFILE': {'type': bool, 'default': True},
        'USE_READABILITY': {'type': bool, 'default': True},
        'USE_MERCURY': {'type': bool, 'default': True},
        'USE_GIT': {'type': bool, 'default': True},
        'USE_CHROME': {'type': bool, 'default': True},
        'USE_NODE': {'type': bool, 'default': True},
        'USE_YOUTUBEDL': {'type': bool, 'default': True},
        'USE_RIPGREP': {'type': bool, 'default': True},

        'CURL_BINARY': {'type': str, 'default': 'curl'},
        'GIT_BINARY': {'type': str, 'default': 'git'},
        'WGET_BINARY': {'type': str, 'default': 'wget'},  # also can accept wget2
        'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
        'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
        'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
        'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},  # also can accept youtube-dl
        'NODE_BINARY': {'type': str, 'default': 'node'},
        'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
        'CHROME_BINARY': {'type': str, 'default': None},

        'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
        'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
        'READWISE_READER_TOKENS': {'type': dict, 'default': {}},
    },
}
  225. ########################## Backwards-Compatibility #############################
  226. # for backwards compatibility with old config files, check old/deprecated names for each key
  227. CONFIG_ALIASES = {
  228. alias: key
  229. for section in CONFIG_SCHEMA.values()
  230. for key, default in section.items()
  231. for alias in default.get('aliases', ())
  232. }
  233. USER_CONFIG = {key: section[key] for section in CONFIG_SCHEMA.values() for key in section.keys()}
  234. def get_real_name(key: str) -> str:
  235. """get the current canonical name for a given deprecated config key"""
  236. return CONFIG_ALIASES.get(key.upper().strip(), key.upper().strip())
  237. ################################ Constants #####################################
  238. PACKAGE_DIR_NAME = 'archivebox'
  239. TEMPLATES_DIR_NAME = 'templates'
  240. ARCHIVE_DIR_NAME = 'archive'
  241. SOURCES_DIR_NAME = 'sources'
  242. LOGS_DIR_NAME = 'logs'
  243. CACHE_DIR_NAME = 'cache'
  244. PERSONAS_DIR_NAME = 'personas'
  245. CRONTABS_DIR_NAME = 'crontabs'
  246. SQL_INDEX_FILENAME = 'index.sqlite3'
  247. JSON_INDEX_FILENAME = 'index.json'
  248. HTML_INDEX_FILENAME = 'index.html'
  249. ROBOTS_TXT_FILENAME = 'robots.txt'
  250. FAVICON_FILENAME = 'favicon.ico'
  251. CONFIG_FILENAME = 'ArchiveBox.conf'
  252. DEFAULT_CLI_COLORS = {
  253. 'reset': '\033[00;00m',
  254. 'lightblue': '\033[01;30m',
  255. 'lightyellow': '\033[01;33m',
  256. 'lightred': '\033[01;35m',
  257. 'red': '\033[01;31m',
  258. 'green': '\033[01;32m',
  259. 'blue': '\033[01;34m',
  260. 'white': '\033[01;37m',
  261. 'black': '\033[01;30m',
  262. }
  263. ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
  264. COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
  265. '00': [(0, 0, 0), (0, 0, 0)],
  266. '30': [(0, 0, 0), (0, 0, 0)],
  267. '31': [(255, 0, 0), (128, 0, 0)],
  268. '32': [(0, 200, 0), (0, 128, 0)],
  269. '33': [(255, 255, 0), (128, 128, 0)],
  270. '34': [(0, 0, 255), (0, 0, 128)],
  271. '35': [(255, 0, 255), (128, 0, 128)],
  272. '36': [(0, 255, 255), (0, 128, 128)],
  273. '37': [(255, 255, 255), (255, 255, 255)],
  274. })
# File extensions that mark a URL as a static asset rather than a web page,
# used to decide which extractor methods make sense for a given URL.
STATICFILE_EXTENSIONS = {
    # 99.999% of the time, URLs ending in these extensions are static files
    # that can be downloaded as-is, not html pages that need to be rendered
    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
    'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
    'atom', 'rss', 'css', 'js', 'json',
    'dmg', 'iso', 'img',
    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',

    # Less common extensions to consider adding later
    # jar, swf, bin, com, exe, dll, deb
    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml

    # These are always treated as pages, not as static files, never add them:
    # html, htm, shtml, xhtml, xml, aspx, php, cgi
}
# When initializing archivebox in a new directory, we check to make sure the dir is
# actually empty so that we dont clobber someone's home directory or desktop by accident.
# These files are exceptions to the is_empty check when we're trying to init a new dir,
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
ALLOWED_IN_OUTPUT_DIR = {
    # common system/tooling artifacts
    '.gitignore',
    'lost+found',
    '.DS_Store',
    '.venv',
    'venv',
    'virtualenv',
    '.virtualenv',
    'node_modules',
    'package.json',
    'package-lock.json',
    'yarn.lock',
    'static',
    'sonic',
    'search.sqlite3',
    # files/dirs created by a previous archivebox run in this folder
    CRONTABS_DIR_NAME,
    ARCHIVE_DIR_NAME,
    SOURCES_DIR_NAME,
    LOGS_DIR_NAME,
    CACHE_DIR_NAME,
    PERSONAS_DIR_NAME,
    SQL_INDEX_FILENAME,
    f'{SQL_INDEX_FILENAME}-wal',
    f'{SQL_INDEX_FILENAME}-shm',
    JSON_INDEX_FILENAME,
    HTML_INDEX_FILENAME,
    ROBOTS_TXT_FILENAME,
    FAVICON_FILENAME,
    CONFIG_FILENAME,
    f'{CONFIG_FILENAME}.bak',
    'static_index.json',
}

# Flags applied when compiling every URL_ALLOWLIST/URL_DENYLIST regex
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
# Module-level constants re-exported through the config system so they show up
# in `archivebox config` output; each is wrapped in a lambda to match the
# {'default': lambda c: ...} entry shape used by the dynamic config schema.
CONSTANTS = {
    "PACKAGE_DIR_NAME": {'default': lambda c: PACKAGE_DIR_NAME},
    "TEMPLATES_DIR_NAME": {'default': lambda c: TEMPLATES_DIR_NAME},
    "ARCHIVE_DIR_NAME": {'default': lambda c: ARCHIVE_DIR_NAME},
    "SOURCES_DIR_NAME": {'default': lambda c: SOURCES_DIR_NAME},
    "LOGS_DIR_NAME": {'default': lambda c: LOGS_DIR_NAME},
    "CACHE_DIR_NAME": {'default': lambda c: CACHE_DIR_NAME},
    "PERSONAS_DIR_NAME": {'default': lambda c: PERSONAS_DIR_NAME},
    "CRONTABS_DIR_NAME": {'default': lambda c: CRONTABS_DIR_NAME},
    "SQL_INDEX_FILENAME": {'default': lambda c: SQL_INDEX_FILENAME},
    "JSON_INDEX_FILENAME": {'default': lambda c: JSON_INDEX_FILENAME},
    "HTML_INDEX_FILENAME": {'default': lambda c: HTML_INDEX_FILENAME},
    "ROBOTS_TXT_FILENAME": {'default': lambda c: ROBOTS_TXT_FILENAME},
    "FAVICON_FILENAME": {'default': lambda c: FAVICON_FILENAME},
    "CONFIG_FILENAME": {'default': lambda c: CONFIG_FILENAME},
    "DEFAULT_CLI_COLORS": {'default': lambda c: DEFAULT_CLI_COLORS},
    "ANSI": {'default': lambda c: ANSI},
    "COLOR_DICT": {'default': lambda c: COLOR_DICT},
    "STATICFILE_EXTENSIONS": {'default': lambda c: STATICFILE_EXTENSIONS},
    "ALLOWED_IN_OUTPUT_DIR": {'default': lambda c: ALLOWED_IN_OUTPUT_DIR},
    "ALLOWDENYLIST_REGEX_FLAGS": {'default': lambda c: ALLOWDENYLIST_REGEX_FLAGS},
}
  353. ############################## Version Config ##################################
  354. def get_system_user() -> str:
  355. # some host OS's are unable to provide a username (k3s, Windows), making this complicated
  356. # uid 999 is especially problematic and breaks many attempts
  357. SYSTEM_USER = None
  358. FALLBACK_USER_PLACHOLDER = f'user_{os.getuid()}'
  359. # Option 1
  360. try:
  361. import pwd
  362. SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
  363. except (ModuleNotFoundError, Exception):
  364. pass
  365. # Option 2
  366. try:
  367. SYSTEM_USER = SYSTEM_USER or getpass.getuser()
  368. except Exception:
  369. pass
  370. # Option 3
  371. try:
  372. SYSTEM_USER = SYSTEM_USER or os.getlogin()
  373. except Exception:
  374. pass
  375. return SYSTEM_USER or FALLBACK_USER_PLACHOLDER
  376. def get_version(config):
  377. try:
  378. return importlib.metadata.version(__package__ or 'archivebox')
  379. except importlib.metadata.PackageNotFoundError:
  380. try:
  381. pyproject_config = (config['PACKAGE_DIR'] / 'pyproject.toml').read_text()
  382. for line in pyproject_config:
  383. if line.startswith('version = '):
  384. return line.split(' = ', 1)[-1].strip('"')
  385. except FileNotFoundError:
  386. # building docs, pyproject.toml is not available
  387. return 'dev'
  388. raise Exception('Failed to detect installed archivebox version!')
  389. def get_commit_hash(config) -> Optional[str]:
  390. try:
  391. git_dir = config['PACKAGE_DIR'] / '../.git'
  392. ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
  393. commit_hash = git_dir.joinpath(ref).read_text().strip()
  394. return commit_hash
  395. except Exception:
  396. pass
  397. try:
  398. return list((config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
  399. except Exception:
  400. pass
  401. return None
  402. def get_build_time(config) -> str:
  403. if config['IN_DOCKER']:
  404. docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
  405. return docker_build_end_time
  406. src_last_modified_unix_timestamp = (config['PACKAGE_DIR'] / 'config.py').stat().st_mtime
  407. return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
  408. def get_versions_available_on_github(config):
  409. """
  410. returns a dictionary containing the ArchiveBox GitHub release info for
  411. the recommended upgrade version and the currently installed version
  412. """
  413. # we only want to perform the (relatively expensive) check for new versions
  414. # when its most relevant, e.g. when the user runs a long-running command
  415. subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
  416. long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
  417. if subcommand_run_by_user not in long_running_commands:
  418. return None
  419. github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
  420. response = requests.get(github_releases_api)
  421. if response.status_code != 200:
  422. stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
  423. return None
  424. all_releases = response.json()
  425. installed_version = parse_version_string(config['VERSION'])
  426. # find current version or nearest older version (to link to)
  427. current_version = None
  428. for idx, release in enumerate(all_releases):
  429. release_version = parse_version_string(release['tag_name'])
  430. if release_version <= installed_version:
  431. current_version = release
  432. break
  433. current_version = current_version or all_releases[-1]
  434. # recommended version is whatever comes after current_version in the release list
  435. # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
  436. try:
  437. recommended_version = all_releases[idx+1]
  438. except IndexError:
  439. recommended_version = None
  440. return {'recommended_version': recommended_version, 'current_version': current_version}
  441. def can_upgrade(config):
  442. if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
  443. recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
  444. current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
  445. return recommended_version > current_version
  446. return False
  447. ############################## Derived Config ##################################
  448. # These are derived/computed values calculated *after* all user-provided config values are ingested
  449. # they appear in `archivebox config` output and are intended to be read-only for the user
  450. DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
  451. **CONSTANTS,
  452. 'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
  453. 'USER': {'default': lambda c: get_system_user()},
  454. 'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}},
  455. 'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent},
  456. 'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME},
  457. 'CUSTOM_TEMPLATES_DIR': {'default': lambda c: c['CUSTOM_TEMPLATES_DIR'] and Path(c['CUSTOM_TEMPLATES_DIR'])},
  458. 'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
  459. 'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
  460. 'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
  461. 'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
  462. 'CACHE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / CACHE_DIR_NAME},
  463. 'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
  464. 'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
  465. 'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
  466. 'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
  467. 'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
  468. 'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
  469. 'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, # exec is always needed to list directories
  470. 'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
  471. 'VERSION': {'default': lambda c: get_version(c).split('+', 1)[0]}, # remove +editable from user-displayed version string
  472. 'COMMIT_HASH': {'default': lambda c: get_commit_hash(c)}, # short git commit hash of codebase HEAD commit
  473. 'BUILD_TIME': {'default': lambda c: get_build_time(c)}, # docker build completed time or python src last modified time
  474. 'VERSIONS_AVAILABLE': {'default': lambda c: get_versions_available_on_github(c)},
  475. 'CAN_UPGRADE': {'default': lambda c: can_upgrade(c)},
  476. 'PYTHON_BINARY': {'default': lambda c: sys.executable},
  477. 'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
  478. 'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
  479. 'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)},
  480. 'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},
  481. 'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
  482. 'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
  483. #'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'}, # set at runtime below, interesting if changed later but unused for now because its always expected to be wal
  484. #'SQLITE_OPTIONS': {'default': lambda c: ['JSON1']}, # set at runtime below
  485. 'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
  486. 'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
  487. 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
  488. 'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
  489. 'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
  490. 'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
  491. 'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
  492. 'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
  493. 'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
  494. 'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
  495. 'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
  496. 'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
  497. 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
  498. 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
  499. 'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
  500. 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
  501. 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
  502. 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
  503. 'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
  504. 'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},
  505. 'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
  506. 'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
  507. 'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
  508. 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
  509. 'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
  510. 'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
  511. 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
  512. 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
  513. 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
  514. 'USE_YOUTUBEDL': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
  515. 'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
  516. 'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
  517. 'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
  518. 'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
  519. 'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
  520. 'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
  521. 'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
  522. 'CHROME_USER_AGENT': {'default': lambda c: c['CHROME_USER_AGENT'].format(**c)},
  523. 'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
  524. 'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
  525. 'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
  526. 'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']},
  527. 'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
  528. 'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
  529. 'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])},
  530. 'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},
  531. 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
  532. 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
  533. 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
  534. 'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
  535. 'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
  536. 'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
  537. 'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
  538. }
  539. ################################### Helpers ####################################
  540. def load_config_val(key: str,
  541. default: ConfigDefaultValue=None,
  542. type: Optional[Type]=None,
  543. aliases: Optional[Tuple[str, ...]]=None,
  544. config: Optional[ConfigDict]=None,
  545. env_vars: Optional[os._Environ]=None,
  546. config_file_vars: Optional[Dict[str, str]]=None) -> ConfigValue:
  547. """parse bool, int, and str key=value pairs from env"""
  548. assert isinstance(config, dict)
  549. is_read_only = type is None
  550. if is_read_only:
  551. if callable(default):
  552. return default(config)
  553. return default
  554. # get value from environment variables or config files
  555. config_keys_to_check = (key, *(aliases or ()))
  556. val = None
  557. for key in config_keys_to_check:
  558. if env_vars:
  559. val = env_vars.get(key)
  560. if val:
  561. break
  562. if config_file_vars:
  563. val = config_file_vars.get(key)
  564. if val:
  565. break
  566. is_unset = val is None
  567. if is_unset:
  568. if callable(default):
  569. return default(config)
  570. return default
  571. # calculate value based on expected type
  572. BOOL_TRUEIES = ('true', 'yes', '1')
  573. BOOL_FALSEIES = ('false', 'no', '0')
  574. if type is bool:
  575. if val.lower() in BOOL_TRUEIES:
  576. return True
  577. elif val.lower() in BOOL_FALSEIES:
  578. return False
  579. else:
  580. raise ValueError(f'Invalid configuration option {key}={val} (expected a boolean: True/False)')
  581. elif type is str:
  582. if val.lower() in (*BOOL_TRUEIES, *BOOL_FALSEIES):
  583. raise ValueError(f'Invalid configuration option {key}={val} (expected a string, but value looks like a boolean)')
  584. return val.strip()
  585. elif type is int:
  586. if not val.strip().isdigit():
  587. raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)')
  588. return int(val.strip())
  589. elif type is list or type is dict:
  590. return json.loads(val)
  591. raise Exception('Config values can only be str, bool, int, or json')
  592. def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]:
  593. """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
  594. out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
  595. assert out_dir and out_dir.is_dir()
  596. config_path = Path(out_dir) / CONFIG_FILENAME
  597. if config_path.exists():
  598. config_file = ConfigParser()
  599. config_file.optionxform = str
  600. config_file.read(config_path)
  601. # flatten into one namespace
  602. config_file_vars = ConfigDict({
  603. key.upper(): val
  604. for section, options in config_file.items()
  605. for key, val in options.items()
  606. })
  607. # print('[i] Loaded config file', os.path.abspath(config_path))
  608. # print(config_file_vars)
  609. return config_file_vars
  610. return None
def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> ConfigDict:
    """write the given key=value pairs into OUTPUT_DIR/Archivebox.conf, then re-validate it

    Takes a backup first; if re-parsing the new file fails, the backup is restored
    and the original exception is re-raised. Returns the final resolved values for
    the keys that were written.
    """
    from .system import atomic_write

    # header written once when the config file is first created
    CONFIG_HEADER = (
"""# This is the config file for your ArchiveBox collection.
#
# You can add options here manually in INI format, or automatically by running:
# archivebox config --set KEY=VALUE
#
# If you modify this file manually, make sure to update your archive after by running:
# archivebox init
#
# A list of all possible config with documentation and examples can be found here:
# https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
""")

    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
    config_path = Path(out_dir) / CONFIG_FILENAME

    if not config_path.exists():
        atomic_write(config_path, CONFIG_HEADER)

    config_file = ConfigParser()
    config_file.optionxform = str  # preserve key case instead of lowercasing
    config_file.read(config_path)

    # keep a backup so we can roll back if the new config fails validation below
    with open(config_path, 'r', encoding='utf-8') as old:
        atomic_write(f'{config_path}.bak', old.read())

    # map each key to the (single) schema section that declares it
    find_section = lambda key: [name for name, opts in CONFIG_SCHEMA.items() if key in opts][0]

    # Set up sections in empty config file
    for key, val in config.items():
        section = find_section(key)
        if section in config_file:
            existing_config = dict(config_file[section])
        else:
            existing_config = {}
        config_file[section] = ConfigDict({**existing_config, key: val})

    # always make sure there's a SECRET_KEY defined for Django
    existing_secret_key = None
    if 'SERVER_CONFIG' in config_file and 'SECRET_KEY' in config_file['SERVER_CONFIG']:
        existing_secret_key = config_file['SERVER_CONFIG']['SECRET_KEY']

    if (not existing_secret_key) or ('not a valid secret' in existing_secret_key):
        from django.utils.crypto import get_random_string
        chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
        random_secret_key = get_random_string(50, chars)
        if 'SERVER_CONFIG' in config_file:
            config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key
        else:
            config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key}

    with open(config_path, 'w+', encoding='utf-8') as new:
        config_file.write(new)

    try:
        # validate the config by attempting to re-parse it
        CONFIG = load_all_config()
    except BaseException:  # lgtm [py/catch-base-exception]
        # something went horribly wrong, revert to the previous version
        with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
            atomic_write(config_path, old.read())
        raise

    # validation passed, the backup is no longer needed
    if Path(f'{config_path}.bak').exists():
        os.remove(f'{config_path}.bak')

    return {
        key.upper(): CONFIG.get(key.upper())
        for key in config.keys()
    }
  672. def load_config(defaults: ConfigDefaultDict,
  673. config: Optional[ConfigDict]=None,
  674. out_dir: Optional[str]=None,
  675. env_vars: Optional[os._Environ]=None,
  676. config_file_vars: Optional[Dict[str, str]]=None) -> ConfigDict:
  677. env_vars = env_vars or os.environ
  678. config_file_vars = config_file_vars or load_config_file(out_dir=out_dir)
  679. extended_config: ConfigDict = config.copy() if config else {}
  680. for key, default in defaults.items():
  681. try:
  682. extended_config[key] = load_config_val(
  683. key,
  684. default=default['default'],
  685. type=default.get('type'),
  686. aliases=default.get('aliases'),
  687. config=extended_config,
  688. env_vars=env_vars,
  689. config_file_vars=config_file_vars,
  690. )
  691. except KeyboardInterrupt:
  692. raise SystemExit(0)
  693. except Exception as e:
  694. stderr()
  695. stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config)
  696. stderr(' {}: {}'.format(e.__class__.__name__, e))
  697. stderr()
  698. stderr(' Check your config for mistakes and try again (your archive data is unaffected).')
  699. stderr()
  700. stderr(' For config documentation and examples see:')
  701. stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
  702. stderr()
  703. # raise
  704. raise SystemExit(2)
  705. return AttrDict(extended_config)
  706. def parse_version_string(version: str) -> Tuple[int, int, int]:
  707. """parses a version tag string formatted like 'vx.x.x' into (major, minor, patch) ints"""
  708. base = version.split('+')[0].split('v')[-1] # remove 'v' prefix and '+editable' suffix
  709. return tuple(int(part) for part in base.split('.'))[:3]
  710. # Logging Helpers
  711. def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
  712. ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
  713. if color:
  714. strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
  715. else:
  716. strs = [' '.join(str(a) for a in args), '\n']
  717. sys.stdout.write(prefix + ''.join(strs))
  718. def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
  719. ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
  720. if color:
  721. strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
  722. else:
  723. strs = [' '.join(str(a) for a in args), '\n']
  724. sys.stderr.write(prefix + ''.join(strs))
  725. def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Optional[ConfigDict]=None) -> None:
  726. ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
  727. if isinstance(text, str):
  728. stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text, **ansi))
  729. else:
  730. stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi))
  731. for line in text[1:]:
  732. stderr('{} {}'.format(prefix, line))
  733. # Dependency Metadata Helpers
def bin_version(binary: Optional[str], cmd: Optional[str]=None) -> Optional[str]:
    """check the presence and return valid version line of a specified binary

    Runs `<binary> --version` (or the given cmd override) and extracts a short
    x.y.z-style version string from the first line of output. Returns None if the
    binary is missing or fails to run.
    """
    abspath = bin_path(binary)
    if not binary or not abspath:
        return None

    try:
        # force untranslated output so the version regex below has a chance to match
        bin_env = os.environ | {'LANG': 'C'}
        # a str cmd is executed through the shell; a list cmd (the default) is not
        is_cmd_str = cmd and isinstance(cmd, str)
        version_str = run(cmd or [abspath, "--version"], shell=is_cmd_str, stdout=PIPE, stderr=STDOUT, env=bin_env).stdout.strip().decode()
        if not version_str:
            # some binaries produce no output under LANG=C; retry with the inherited env
            version_str = run(cmd or [abspath, "--version"], shell=is_cmd_str, stdout=PIPE, stderr=STDOUT).stdout.strip().decode()

        # pull the first dotted-number token (e.g. "1.21.4") out of the first line
        version_ptn = re.compile(r"\d+?\.\d+?\.?\d*", re.MULTILINE)
        try:
            version_nums = version_ptn.findall(version_str.split('\n')[0])[0]
            if version_nums:
                return version_nums
            else:
                raise IndexError
        except IndexError:
            # no dotted number found: fall back to the first 3 whitespace-separated
            # columns of the first line of version info
            return ' '.join(version_str.split('\n')[0].strip().split()[:3])
    except OSError:
        pass
        # stderr(f'[X] Unable to find working version of dependency: {binary}', color='red')
        # stderr(' Make sure it\'s installed, then confirm it\'s working by running:')
        # stderr(f' {binary} --version')
        # stderr()
        # stderr(' If you don\'t want to install it, you can disable it via config. See here for more info:')
        # stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Install')
    return None
  765. def bin_path(binary: Optional[str]) -> Optional[str]:
  766. if binary is None:
  767. return None
  768. node_modules_bin = Path('.') / 'node_modules' / '.bin' / binary
  769. if node_modules_bin.exists():
  770. return str(node_modules_bin.resolve())
  771. return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary
  772. def bin_hash(binary: Optional[str]) -> Optional[str]:
  773. if binary is None:
  774. return None
  775. abs_path = bin_path(binary)
  776. if abs_path is None or not Path(abs_path).exists():
  777. return None
  778. file_hash = md5()
  779. with io.open(abs_path, mode='rb') as f:
  780. for chunk in iter(lambda: f.read(io.DEFAULT_BUFFER_SIZE), b''):
  781. file_hash.update(chunk)
  782. return f'md5:{file_hash.hexdigest()}'
  783. def find_chrome_binary() -> Optional[str]:
  784. """find any installed chrome binaries in the default locations"""
  785. # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
  786. # make sure data dir finding precedence order always matches binary finding order
  787. default_executable_paths = (
  788. # '~/Library/Caches/ms-playwright/chromium-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium',
  789. 'chromium-browser',
  790. 'chromium',
  791. '/Applications/Chromium.app/Contents/MacOS/Chromium',
  792. 'chrome',
  793. 'google-chrome',
  794. '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
  795. 'google-chrome-stable',
  796. 'google-chrome-beta',
  797. 'google-chrome-canary',
  798. '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
  799. 'google-chrome-unstable',
  800. 'google-chrome-dev',
  801. )
  802. for name in default_executable_paths:
  803. full_path_exists = shutil.which(name)
  804. if full_path_exists:
  805. return name
  806. return None
  807. def find_chrome_data_dir() -> Optional[str]:
  808. """find any installed chrome user data directories in the default locations"""
  809. # deprecated because this is DANGEROUS, do not re-implement/uncomment this behavior.
  810. # Going forward we want to discourage people from using their main chrome profile for archiving.
  811. # Session tokens, personal data, and cookies are often returned in server responses,
  812. # when they get archived, they are essentially burned as anyone who can view the archive
  813. # can use that data to masquerade as the logged-in user that did the archiving.
  814. # For this reason users should always create dedicated burner profiles for archiving and not use
  815. # their daily driver main accounts.
  816. # # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
  817. # # make sure data dir finding precedence order always matches binary finding order
  818. # default_profile_paths = (
  819. # '~/.config/chromium',
  820. # '~/Library/Application Support/Chromium',
  821. # '~/AppData/Local/Chromium/User Data',
  822. # '~/.config/chrome',
  823. # '~/.config/google-chrome',
  824. # '~/Library/Application Support/Google/Chrome',
  825. # '~/AppData/Local/Google/Chrome/User Data',
  826. # '~/.config/google-chrome-stable',
  827. # '~/.config/google-chrome-beta',
  828. # '~/Library/Application Support/Google/Chrome Canary',
  829. # '~/AppData/Local/Google/Chrome SxS/User Data',
  830. # '~/.config/google-chrome-unstable',
  831. # '~/.config/google-chrome-dev',
  832. # )
  833. # for path in default_profile_paths:
  834. # full_path = Path(path).resolve()
  835. # if full_path.exists():
  836. # return full_path
  837. return None
  838. def wget_supports_compression(config):
  839. try:
  840. cmd = [
  841. config['WGET_BINARY'],
  842. "--compression=auto",
  843. "--help",
  844. ]
  845. return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode
  846. except (FileNotFoundError, OSError):
  847. return False
  848. def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
  849. return {
  850. 'PACKAGE_DIR': {
  851. 'path': (config['PACKAGE_DIR']).resolve(),
  852. 'enabled': True,
  853. 'is_valid': (config['PACKAGE_DIR'] / '__main__.py').exists(),
  854. },
  855. 'TEMPLATES_DIR': {
  856. 'path': (config['TEMPLATES_DIR']).resolve(),
  857. 'enabled': True,
  858. 'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
  859. },
  860. # 'NODE_MODULES_DIR': {
  861. # 'path': ,
  862. # 'enabled': ,
  863. # 'is_valid': (...).exists(),
  864. # },
  865. }
def get_data_locations(config: ConfigDict) -> ConfigValue:
    """report every data file/dir ArchiveBox uses: path, enabled flag, validity,
    and (for dirs commonly bind-mounted into docker) whether it is a mount point"""
    return {
        # OLD: migrating to personas
        # 'CHROME_USER_DATA_DIR': {
        #     'path': os.path.abspath(config['CHROME_USER_DATA_DIR']),
        #     'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
        #     'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
        # },
        # 'COOKIES_FILE': {
        #     'path': os.path.abspath(config['COOKIES_FILE']),
        #     'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
        #     'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
        # },
        'OUTPUT_DIR': {
            'path': config['OUTPUT_DIR'].resolve(),
            'enabled': True,
            # presence of the SQL index is what marks a dir as an initialized collection
            'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
            'is_mount': os.path.ismount(config['OUTPUT_DIR'].resolve()),
        },
        'CONFIG_FILE': {
            'path': config['CONFIG_FILE'].resolve(),
            'enabled': True,
            'is_valid': config['CONFIG_FILE'].exists(),
        },
        'SQL_INDEX': {
            'path': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve(),
            'enabled': True,
            'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
            'is_mount': os.path.ismount((config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve()),
        },
        'ARCHIVE_DIR': {
            'path': config['ARCHIVE_DIR'].resolve(),
            'enabled': True,
            'is_valid': config['ARCHIVE_DIR'].exists(),
            'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
        },
        'SOURCES_DIR': {
            'path': config['SOURCES_DIR'].resolve(),
            'enabled': True,
            'is_valid': config['SOURCES_DIR'].exists(),
        },
        'PERSONAS_DIR': {
            'path': config['PERSONAS_DIR'].resolve(),
            'enabled': True,
            'is_valid': config['PERSONAS_DIR'].exists(),
        },
        'LOGS_DIR': {
            'path': config['LOGS_DIR'].resolve(),
            'enabled': True,
            'is_valid': config['LOGS_DIR'].exists(),
        },
        'CACHE_DIR': {
            'path': config['CACHE_DIR'].resolve(),
            'enabled': True,
            'is_valid': config['CACHE_DIR'].exists(),
        },
        # CUSTOM_TEMPLATES_DIR is optional (may be None/empty), hence the guards
        'CUSTOM_TEMPLATES_DIR': {
            'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
            'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
            'is_valid': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).exists(),
        },
        # managed by bin/docker_entrypoint.sh and python-crontab:
        # 'CRONTABS_DIR': {
        #     'path': config['CRONTABS_DIR'].resolve(),
        #     'enabled': True,
        #     'is_valid': config['CRONTABS_DIR'].exists(),
        # },
    }
  934. def get_dependency_info(config: ConfigDict) -> ConfigValue:
  935. return {
  936. 'PYTHON_BINARY': {
  937. 'path': bin_path(config['PYTHON_BINARY']),
  938. 'version': config['PYTHON_VERSION'],
  939. 'hash': bin_hash(config['PYTHON_BINARY']),
  940. 'enabled': True,
  941. 'is_valid': bool(config['PYTHON_VERSION']),
  942. },
  943. 'SQLITE_BINARY': {
  944. 'path': bin_path(config['SQLITE_BINARY']),
  945. 'version': config['SQLITE_VERSION'],
  946. 'hash': bin_hash(config['SQLITE_BINARY']),
  947. 'enabled': True,
  948. 'is_valid': bool(config['SQLITE_VERSION']),
  949. },
  950. 'DJANGO_BINARY': {
  951. 'path': bin_path(config['DJANGO_BINARY']),
  952. 'version': config['DJANGO_VERSION'],
  953. 'hash': bin_hash(config['DJANGO_BINARY']),
  954. 'enabled': True,
  955. 'is_valid': bool(config['DJANGO_VERSION']),
  956. },
  957. 'ARCHIVEBOX_BINARY': {
  958. 'path': bin_path(config['ARCHIVEBOX_BINARY']),
  959. 'version': config['VERSION'],
  960. 'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
  961. 'enabled': True,
  962. 'is_valid': True,
  963. },
  964. 'CURL_BINARY': {
  965. 'path': bin_path(config['CURL_BINARY']),
  966. 'version': config['CURL_VERSION'],
  967. 'hash': bin_hash(config['CURL_BINARY']),
  968. 'enabled': config['USE_CURL'],
  969. 'is_valid': bool(config['CURL_VERSION']),
  970. },
  971. 'WGET_BINARY': {
  972. 'path': bin_path(config['WGET_BINARY']),
  973. 'version': config['WGET_VERSION'],
  974. 'hash': bin_hash(config['WGET_BINARY']),
  975. 'enabled': config['USE_WGET'],
  976. 'is_valid': bool(config['WGET_VERSION']),
  977. },
  978. 'NODE_BINARY': {
  979. 'path': bin_path(config['NODE_BINARY']),
  980. 'version': config['NODE_VERSION'],
  981. 'hash': bin_hash(config['NODE_BINARY']),
  982. 'enabled': config['USE_NODE'],
  983. 'is_valid': bool(config['NODE_VERSION']),
  984. },
  985. 'SINGLEFILE_BINARY': {
  986. 'path': bin_path(config['SINGLEFILE_BINARY']),
  987. 'version': config['SINGLEFILE_VERSION'],
  988. 'hash': bin_hash(config['SINGLEFILE_BINARY']),
  989. 'enabled': config['USE_SINGLEFILE'],
  990. 'is_valid': bool(config['SINGLEFILE_VERSION']),
  991. },
  992. 'READABILITY_BINARY': {
  993. 'path': bin_path(config['READABILITY_BINARY']),
  994. 'version': config['READABILITY_VERSION'],
  995. 'hash': bin_hash(config['READABILITY_BINARY']),
  996. 'enabled': config['USE_READABILITY'],
  997. 'is_valid': bool(config['READABILITY_VERSION']),
  998. },
  999. 'MERCURY_BINARY': {
  1000. 'path': bin_path(config['MERCURY_BINARY']),
  1001. 'version': config['MERCURY_VERSION'],
  1002. 'hash': bin_hash(config['MERCURY_BINARY']),
  1003. 'enabled': config['USE_MERCURY'],
  1004. 'is_valid': bool(config['MERCURY_VERSION']),
  1005. },
  1006. 'GIT_BINARY': {
  1007. 'path': bin_path(config['GIT_BINARY']),
  1008. 'version': config['GIT_VERSION'],
  1009. 'hash': bin_hash(config['GIT_BINARY']),
  1010. 'enabled': config['USE_GIT'],
  1011. 'is_valid': bool(config['GIT_VERSION']),
  1012. },
  1013. 'YOUTUBEDL_BINARY': {
  1014. 'path': bin_path(config['YOUTUBEDL_BINARY']),
  1015. 'version': config['YOUTUBEDL_VERSION'],
  1016. 'hash': bin_hash(config['YOUTUBEDL_BINARY']),
  1017. 'enabled': config['USE_YOUTUBEDL'],
  1018. 'is_valid': bool(config['YOUTUBEDL_VERSION']),
  1019. },
  1020. 'CHROME_BINARY': {
  1021. 'path': bin_path(config['CHROME_BINARY']),
  1022. 'version': config['CHROME_VERSION'],
  1023. 'hash': bin_hash(config['CHROME_BINARY']),
  1024. 'enabled': config['USE_CHROME'],
  1025. 'is_valid': bool(config['CHROME_VERSION']),
  1026. },
  1027. 'RIPGREP_BINARY': {
  1028. 'path': bin_path(config['RIPGREP_BINARY']),
  1029. 'version': config['RIPGREP_VERSION'],
  1030. 'hash': bin_hash(config['RIPGREP_BINARY']),
  1031. 'enabled': config['USE_RIPGREP'],
  1032. 'is_valid': bool(config['RIPGREP_VERSION']),
  1033. },
  1034. # TODO: add an entry for the sonic search backend?
  1035. # 'SONIC_BINARY': {
  1036. # 'path': bin_path(config['SONIC_BINARY']),
  1037. # 'version': config['SONIC_VERSION'],
  1038. # 'hash': bin_hash(config['SONIC_BINARY']),
  1039. # 'enabled': config['USE_SONIC'],
  1040. # 'is_valid': bool(config['SONIC_VERSION']),
  1041. # },
  1042. }
  1043. def get_chrome_info(config: ConfigDict) -> ConfigValue:
  1044. return {
  1045. 'TIMEOUT': config['TIMEOUT'],
  1046. 'RESOLUTION': config['RESOLUTION'],
  1047. 'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'],
  1048. 'CHROME_BINARY': bin_path(config['CHROME_BINARY']),
  1049. 'CHROME_TIMEOUT': config['CHROME_TIMEOUT'],
  1050. 'CHROME_HEADLESS': config['CHROME_HEADLESS'],
  1051. 'CHROME_SANDBOX': config['CHROME_SANDBOX'],
  1052. 'CHROME_USER_AGENT': config['CHROME_USER_AGENT'],
  1053. 'CHROME_USER_DATA_DIR': config['CHROME_USER_DATA_DIR'],
  1054. }
  1055. # ******************************************************************************
  1056. # ******************************************************************************
  1057. # ******************************** Load Config *********************************
  1058. # ******* (compile the defaults, configs, and metadata all into CONFIG) ********
  1059. # ******************************************************************************
  1060. # ******************************************************************************
  1061. def load_all_config():
  1062. CONFIG: ConfigDict = ConfigDict()
  1063. for section_name, section_config in CONFIG_SCHEMA.items():
  1064. CONFIG = load_config(section_config, CONFIG)
  1065. return load_config(DYNAMIC_CONFIG_SCHEMA, CONFIG)
# add all final config values in CONFIG to globals in this file
CONFIG: ConfigDict = load_all_config()
globals().update(CONFIG)
# this lets us do: from .config import DEBUG, MEDIA_TIMEOUT, ...


# ******************************************************************************
# ******************************************************************************
# ******************************************************************************
# ******************************************************************************
# ******************************************************************************


########################### System Environment Setup ###########################

# Set timezone to UTC and umask to OUTPUT_PERMISSIONS
# NOTE(review): assert statements are stripped under `python -O`, so this invariant
# would then go unchecked — consider raising instead if that mode is supported
assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC'  # noqa: F821
os.environ["TZ"] = TIMEZONE  # noqa: F821
# umask is the complement of the desired permissions (exec bits included for dirs)
os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8))  # noqa: F821

# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))
sys.path.append(NODE_BIN_PATH)
# NOTE(review): despite the comment above, this appends to sys.path (Python's module
# import path), not os.environ['PATH'] — presumably bin_path()'s explicit
# node_modules/.bin lookup is what actually locates these binaries; confirm intent

# OPTIONAL: also look around the host system for node modules to use
# avoid enabling this unless absolutely needed,
# having overlapping potential sources of libs is a big source of bugs/confusing to users
# DEV_NODE_BIN_PATH = str((Path(CONFIG["PACKAGE_DIR"]).absolute() / '..' / 'node_modules' / '.bin'))
# sys.path.append(DEV_NODE_BIN_PATH)
# USER_NODE_BIN_PATH = str(Path('~/.node_modules/.bin').resolve())
# sys.path.append(USER_NODE_BIN_PATH)

# disable stderr "you really shouldnt disable ssl" warnings with library config
if not CONFIG['CHECK_SSL_VALIDITY']:
    import urllib3
    requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# get SQLite database version, compile options, and runtime options
# TODO: make this a less hacky proper assertion checker helper function in somewhere like setup_django
#cursor = sqlite3.connect(':memory:').cursor()
#DYNAMIC_CONFIG_SCHEMA['SQLITE_VERSION'] = lambda c: cursor.execute("SELECT sqlite_version();").fetchone()[0]
#DYNAMIC_CONFIG_SCHEMA['SQLITE_JOURNAL_MODE'] = lambda c: cursor.execute('PRAGMA journal_mode;').fetchone()[0]
#DYNAMIC_CONFIG_SCHEMA['SQLITE_OPTIONS'] = lambda c: [option[0] for option in cursor.execute('PRAGMA compile_options;').fetchall()]
#cursor.close()
  1102. ########################### Config Validity Checkers ###########################
  1103. def check_system_config(config: ConfigDict=CONFIG) -> None:
  1104. ### Check system environment
  1105. if config['USER'] == 'root' or str(config['PUID']) == "0":
  1106. stderr('[!] ArchiveBox should never be run as root!', color='red')
  1107. stderr(' For more information, see the security overview documentation:')
  1108. stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
  1109. if config['IN_DOCKER']:
  1110. attempted_command = ' '.join(sys.argv[:3])
  1111. stderr('')
  1112. stderr(' {lightred}Hint{reset}: When using Docker, you must run commands with {green}docker run{reset} instead of {lightyellow}docker exec{reset}, e.g.:'.format(**config['ANSI']))
  1113. stderr(f' docker compose run archivebox {attempted_command}')
  1114. stderr(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}')
  1115. stderr(' or:')
  1116. stderr(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"')
  1117. stderr(f' docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"')
  1118. raise SystemExit(2)
  1119. ### Check Python environment
  1120. if sys.version_info[:3] < (3, 7, 0):
  1121. stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
  1122. stderr(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
  1123. raise SystemExit(2)
  1124. if int(CONFIG['DJANGO_VERSION'].split('.')[0]) < 3:
  1125. stderr(f'[X] Django version is not new enough: {config["DJANGO_VERSION"]} (>3.0 is required)', color='red')
  1126. stderr(' Upgrade django using pip or your system package manager: pip3 install --upgrade django')
  1127. raise SystemExit(2)
  1128. if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'):
  1129. stderr(f'[X] Your system is running python3 scripts with a bad locale setting: {config["PYTHON_ENCODING"]} (it should be UTF-8).', color='red')
  1130. stderr(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
  1131. stderr(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"')
  1132. stderr('')
  1133. stderr(' Confirm that it\'s fixed by opening a new shell and running:')
  1134. stderr(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
  1135. raise SystemExit(2)
  1136. # stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
  1137. # stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
  1138. if config['CHROME_USER_DATA_DIR'] is not None and Path(config['CHROME_USER_DATA_DIR']).exists():
  1139. if not (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists():
  1140. stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red')
  1141. stderr(f' {config["CHROME_USER_DATA_DIR"]}')
  1142. stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
  1143. stderr(' For more info see:')
  1144. stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
  1145. if '/Default' in str(config['CHROME_USER_DATA_DIR']):
  1146. stderr()
  1147. stderr(' Try removing /Default from the end e.g.:')
  1148. stderr(' CHROME_USER_DATA_DIR="{}"'.format(str(config['CHROME_USER_DATA_DIR']).split('/Default')[0]))
  1149. # hard error is too annoying here, instead just set it to nothing
  1150. # raise SystemExit(2)
  1151. config['CHROME_USER_DATA_DIR'] = None
  1152. else:
  1153. config['CHROME_USER_DATA_DIR'] = None
  1154. def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
  1155. invalid_dependencies = [
  1156. (name, info) for name, info in config['DEPENDENCIES'].items()
  1157. if info['enabled'] and not info['is_valid']
  1158. ]
  1159. if invalid_dependencies and show_help:
  1160. stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
  1161. for dependency, info in invalid_dependencies:
  1162. stderr(
  1163. ' ! {}: {} ({})'.format(
  1164. dependency,
  1165. info['path'] or 'unable to find binary',
  1166. info['version'] or 'unable to detect version',
  1167. )
  1168. )
  1169. if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
  1170. hint(('To install all packages automatically run: archivebox setup',
  1171. f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False',
  1172. ''), prefix=' ')
  1173. stderr('')
  1174. if config['TIMEOUT'] < 5:
  1175. stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
  1176. stderr(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.')
  1177. stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)')
  1178. stderr()
  1179. stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
  1180. stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
  1181. stderr()
  1182. elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
  1183. stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
  1184. stderr(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
  1185. stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
  1186. stderr()
  1187. stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
  1188. stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
  1189. stderr()
  1190. if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
  1191. stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
  1192. stderr(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.')
  1193. stderr(' (Setting it somewhere over 60 seconds is recommended)')
  1194. stderr()
  1195. stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
  1196. stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
  1197. stderr()
  1198. def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG) -> None:
  1199. output_dir = out_dir or config['OUTPUT_DIR']
  1200. assert isinstance(output_dir, (str, Path))
  1201. archive_dir_exists = (Path(output_dir) / ARCHIVE_DIR_NAME).exists()
  1202. if not archive_dir_exists:
  1203. stderr('[X] No archivebox index found in the current directory.', color='red')
  1204. stderr(f' {output_dir}', color='lightyellow')
  1205. stderr()
  1206. stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI']))
  1207. stderr(' cd path/to/your/archive/folder')
  1208. stderr(' archivebox [command]')
  1209. stderr()
  1210. stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI']))
  1211. stderr(' archivebox init')
  1212. raise SystemExit(2)
  1213. def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG):
  1214. output_dir = out_dir or config['OUTPUT_DIR']
  1215. from .index.sql import list_migrations
  1216. pending_migrations = [name for status, name in list_migrations() if not status]
  1217. if pending_migrations:
  1218. stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
  1219. stderr(f' {output_dir}')
  1220. stderr()
  1221. stderr(f' To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:')
  1222. stderr(' archivebox init')
  1223. raise SystemExit(3)
  1224. (Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True)
  1225. (Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True)
  1226. (Path(output_dir) / CACHE_DIR_NAME).mkdir(exist_ok=True)
  1227. (Path(output_dir) / PERSONAS_DIR_NAME).mkdir(exist_ok=True)
  1228. (Path(output_dir) / PERSONAS_DIR_NAME / 'Default').mkdir(exist_ok=True)
  1229. def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None:
  1230. check_system_config()
  1231. output_dir = out_dir or Path(config['OUTPUT_DIR'])
  1232. assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path)
  1233. try:
  1234. from django.core.management import call_command
  1235. sys.path.append(str(config['PACKAGE_DIR']))
  1236. os.environ.setdefault('OUTPUT_DIR', str(output_dir))
  1237. assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
  1238. os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
  1239. # Check to make sure JSON extension is available in our Sqlite3 instance
  1240. try:
  1241. cursor = sqlite3.connect(':memory:').cursor()
  1242. cursor.execute('SELECT JSON(\'{"a": "b"}\')')
  1243. except sqlite3.OperationalError as exc:
  1244. stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red')
  1245. hint([
  1246. 'Upgrade your Python version or install the extension manually:',
  1247. 'https://code.djangoproject.com/wiki/JSON1Extension'
  1248. ])
  1249. if in_memory_db:
  1250. # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
  1251. # in those cases we create a temporary in-memory db and run the migrations
  1252. # immediately to get a usable in-memory-database at startup
  1253. os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
  1254. django.setup()
  1255. call_command("migrate", interactive=False, verbosity=0)
  1256. else:
  1257. # Otherwise use default sqlite3 file-based database and initialize django
  1258. # without running migrations automatically (user runs them manually by calling init)
  1259. django.setup()
  1260. from django.conf import settings
  1261. # log startup message to the error log
  1262. with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
  1263. command = ' '.join(sys.argv)
  1264. ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
  1265. f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n")
  1266. if check_db:
  1267. # Enable WAL mode in sqlite3
  1268. from django.db import connection
  1269. with connection.cursor() as cursor:
  1270. # Set Journal mode to WAL to allow for multiple writers
  1271. current_mode = cursor.execute("PRAGMA journal_mode")
  1272. if current_mode != 'wal':
  1273. cursor.execute("PRAGMA journal_mode=wal;")
  1274. # Set max blocking delay for concurrent writes and write sync mode
  1275. # https://litestream.io/tips/#busy-timeout
  1276. cursor.execute("PRAGMA busy_timeout = 5000;")
  1277. cursor.execute("PRAGMA synchronous = NORMAL;")
  1278. # Create cache table in DB if needed
  1279. try:
  1280. from django.core.cache import cache
  1281. cache.get('test', None)
  1282. except django.db.utils.OperationalError:
  1283. call_command("createcachetable", verbosity=0)
  1284. # if archivebox gets imported multiple times, we have to close
  1285. # the sqlite3 whenever we init from scratch to avoid multiple threads
  1286. # sharing the same connection by accident
  1287. from django.db import connections
  1288. for conn in connections.all():
  1289. conn.close_if_unusable_or_obsolete()
  1290. sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
  1291. assert sql_index_path.exists(), (
  1292. f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
  1293. except KeyboardInterrupt:
  1294. raise SystemExit(2)