main.py 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
import argparse
import json
import os
import sys

import crawler
  5. # Gestion des parametres
  6. parser = argparse.ArgumentParser(description='Crawler pour la creation de site map')
  7. parser.add_argument('--skipext', action="append", default=[], required=False, help="File extension to skip")
  8. parser.add_argument('--parserobots', action="store_true", default=False, required=False, help="Ignore file defined in robots.txt")
  9. parser.add_argument('--debug', action="store_true", default=False, help="Enable debug mode")
  10. parser.add_argument('-v', '--verbose', action="store_true", help="Enable verbose output")
  11. parser.add_argument('--output', action="store", default=None, help="Output file")
  12. parser.add_argument('--exclude', action="append", default=[], required=False, help="Exclude Url if contain")
  13. parser.add_argument('--drop', action="append", default=[], required=False, help="Drop a string from the url")
  14. parser.add_argument('--report', action="store_true", default=False, required=False, help="Display a report")
  15. parser.add_argument('--images', action="store_true", default=False, required=False, help="Add image to sitemap.xml (see https://support.google.com/webmasters/answer/178636?hl=en)")
  16. group = parser.add_mutually_exclusive_group()
  17. group.add_argument('--config', action="store", default=None, help="Configuration file in json format")
  18. group.add_argument('--domain', action="store", default="", help="Target domain (ex: http://blog.lesite.us)")
  19. arg = parser.parse_args()
  20. # Read the config file if needed
  21. if arg.config is not None:
  22. try:
  23. config_data=open(arg.config,'r')
  24. config = json.load(config_data)
  25. config_data.close()
  26. except Exception as e:
  27. config = {}
  28. else:
  29. config = {}
  30. # Overload config with flag parameters
  31. dict_arg = arg.__dict__
  32. for argument in config:
  33. if argument in dict_arg:
  34. if type(dict_arg[argument]).__name__ == 'list':
  35. dict_arg[argument].extend(config[argument])
  36. elif type(dict_arg[argument]).__name__ == 'bool':
  37. if dict_arg[argument]:
  38. dict_arg[argument] = True
  39. else:
  40. dict_arg[argument] = config[argument]
  41. else:
  42. dict_arg[argument] = config[argument]
  43. del(dict_arg['config'])
  44. crawl = crawler.Crawler(**dict_arg)
  45. crawl.run()
  46. if arg.report:
  47. crawl.make_report()