A framework for quick web archiving
選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

84 行
2.8 KiB

  1. import argparse
  2. import asyncio
  3. import importlib.util
  4. import logging
  5. import os.path
  6. import qwarc
  7. import sys
  8. import time
  9. def setup_logging(logFilename):
  10. rootLogger = logging.getLogger()
  11. rootLogger.handlers = []
  12. rootLogger.setLevel(logging.INFO)
  13. formatter = logging.Formatter('%(asctime)s.%(msecs)03dZ %(levelname)s %(message)s', datefmt = '%Y-%m-%d %H:%M:%S')
  14. formatter.converter = time.gmtime
  15. fileHandler = logging.FileHandler(logFilename)
  16. fileHandler.setFormatter(formatter)
  17. rootLogger.addHandler(fileHandler)
  18. stderrHandler = logging.StreamHandler()
  19. stderrHandler.setFormatter(formatter)
  20. rootLogger.addHandler(stderrHandler)
  21. def check_files(specFilename, logFilename):
  22. success = True
  23. if not os.path.isfile(specFilename):
  24. print('Error: "{}" does not exist or is not a regular file', file = sys.stderr)
  25. success = False
  26. if os.path.exists(logFilename):
  27. print('Error: "{}" already exists'.format(logFilename), file = sys.stderr)
  28. success = False
  29. if os.path.exists('STOP'):
  30. print('Error: "STOP" exists', file = sys.stderr)
  31. success = False
  32. return success
  33. def main():
  34. parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
  35. parser.add_argument('--log', metavar = 'LOGFILE', default = './qwarc.log')
  36. parser.add_argument('--database', metavar = 'DBFILE', default = './qwarc.db')
  37. parser.add_argument('--warc', metavar = 'PREFIX', help = 'prefix for the WARC filenames', default = './qwarc')
  38. parser.add_argument('--concurrency', type = int, default = 1)
  39. parser.add_argument('--memorylimit', metavar = 'LIMIT', help = 'pause when less than LIMIT bytes memory is free; disable if 0', default = 0)
  40. parser.add_argument('--disklimit', metavar = 'LIMIT', help = 'pause when less than LIMIT bytes disk space is free; disable if 0', default = 0)
  41. parser.add_argument('--warcsplit', metavar = 'SIZE', help = 'split WARCs into files of SIZE bytes; disable if 0', default = 0)
  42. parser.add_argument('--warcdedupe', action = 'store_true', help = 'enable deduplication of WARC records')
  43. parser.add_argument('specfile')
  44. args = parser.parse_args()
  45. if not check_files(args.specfile, args.log):
  46. sys.exit(1)
  47. setup_logging(args.log)
  48. spec = importlib.util.spec_from_file_location('spec', args.specfile)
  49. specMod = importlib.util.module_from_spec(spec)
  50. spec.loader.exec_module(specMod)
  51. a = qwarc.QWARC(
  52. itemClasses = qwarc.Item.__subclasses__(),
  53. warcBasePath = args.warc,
  54. dbPath = args.database,
  55. concurrency = args.concurrency,
  56. memoryLimit = args.memorylimit,
  57. minFreeDisk = args.disklimit,
  58. warcSizeLimit = args.warcsplit,
  59. warcDedupe = args.warcdedupe,
  60. )
  61. if not os.path.exists(args.database):
  62. a.create_db()
  63. loop = asyncio.get_event_loop()
  64. try:
  65. loop.run_until_complete(a.run(loop))
  66. except (Exception, KeyboardInterrupt) as e:
  67. logging.exception('Unhandled error')
  68. loop.close()