crawler.py

import logging
import mimetypes
import os
import re
import sys
from datetime import datetime
from urllib.parse import urljoin, urlparse, urlunparse
from urllib.request import urlopen, Request
from urllib.robotparser import RobotFileParser

import config


class Crawler:
    # Configuration and state
    parserobots = False
    output = None
    report = False

    config = None
    domain = ""

    exclude = []
    skipext = []
    drop = []

    debug = False

    tocrawl = set([])
    crawled = set([])
    excluded = set([])

    marked = {}

    not_parseable_ressources = (".epub", ".mobi", ".docx", ".doc", ".opf", ".7z", ".ibooks", ".cbr",
                                ".avi", ".mkv", ".mp4", ".jpg", ".jpeg", ".png", ".gif", ".pdf",
                                ".iso", ".rar", ".tar", ".tgz", ".zip", ".dmg", ".exe")

    # Regexes for <a href="..."> and <img src="..."> (bytes, since responses are read as bytes)
    linkregex = re.compile(b'<a [^>]*href=[\'"](.*?)[\'"][^>]*?>')
    imageregex = re.compile(b'<img [^>]*src=[\'"](.*?)[\'"].*?>')

    rp = None
    response_code = {}
    nb_url = 1      # Number of URLs found
    nb_rp = 0       # Number of URLs blocked by robots.txt
    nb_exclude = 0  # Number of URLs excluded by extension or word

    output_file = None

    target_domain = ""
    scheme = ""
    def __init__(self, parserobots=False, output=None, report=False, domain="",
                 exclude=[], skipext=[], drop=[], debug=False, verbose=False, images=False):
        self.parserobots = parserobots
        self.output = output
        self.report = report
        self.domain = domain
        self.exclude = exclude
        self.skipext = skipext
        self.drop = drop
        self.debug = debug
        self.verbose = verbose
        self.images = images

        if self.debug:
            log_level = logging.DEBUG
        elif self.verbose:
            log_level = logging.INFO
        else:
            log_level = logging.ERROR

        logging.basicConfig(level=log_level)

        self.tocrawl = set([self.clean_link(domain)])

        try:
            url_parsed = urlparse(domain)
            self.target_domain = url_parsed.netloc
            self.scheme = url_parsed.scheme
        except Exception:
            logging.error("Invalid domain")
            raise ValueError("Invalid domain")

        if self.output:
            try:
                self.output_file = open(self.output, 'w')
            except Exception:
                logging.error("Output file not available.")
                sys.exit(255)
    def run(self):
        print(config.xml_header, file=self.output_file)

        if self.parserobots:
            self.check_robots()

        logging.info("Start the crawling process")

        while len(self.tocrawl) != 0:
            self.__crawling()

        logging.info("Crawling has reached end of all found links")

        print(config.xml_footer, file=self.output_file)
    def __crawling(self):
        crawling = self.tocrawl.pop()
        url = urlparse(crawling)
        self.crawled.add(crawling)
        logging.info("Crawling #{}: {}".format(len(self.crawled), url.geturl()))

        request = Request(crawling, headers={"User-Agent": config.crawler_user_agent})

        # Ignore resources listed in not_parseable_ressources.
        # This avoids downloading files such as PDFs, archives, images, etc.
        if not url.path.endswith(self.not_parseable_ressources):
            try:
                response = urlopen(request)
            except Exception as e:
                if hasattr(e, 'code'):
                    if e.code in self.response_code:
                        self.response_code[e.code] += 1
                    else:
                        self.response_code[e.code] = 1

                    # Track marked URLs for the report
                    if self.report:
                        if e.code in self.marked:
                            self.marked[e.code].append(crawling)
                        else:
                            self.marked[e.code] = [crawling]

                logging.debug("{1} ==> {0}".format(e, crawling))
                return self.__continue_crawling()
        else:
            logging.debug("Ignoring {0}: content might not be parseable.".format(crawling))
            response = None
        # Read the response
        if response is not None:
            try:
                msg = response.read()
                if response.getcode() in self.response_code:
                    self.response_code[response.getcode()] += 1
                else:
                    self.response_code[response.getcode()] = 1

                response.close()

                # Get the last modified date
                if 'last-modified' in response.headers:
                    date = response.headers['Last-Modified']
                else:
                    date = response.headers['Date']

                date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')

            except Exception as e:
                logging.debug("{1} ===> {0}".format(e, crawling))
                return None
        else:
            # Response is None: the content was not downloaded,
            # just continue and add the link to the sitemap.
            msg = b""
            date = None
        # Is the image sitemap enabled?
        image_list = ""
        if self.images:
            # Search for images in the current page.
            images = self.imageregex.findall(msg)
            for image_link in list(set(images)):
                image_link = image_link.decode("utf-8", errors="ignore")

                # Ignore links starting with data:
                if image_link.startswith("data:"):
                    continue

                # If the path starts with //, reuse the current URL scheme
                if image_link.startswith("//"):
                    image_link = url.scheme + ":" + image_link
                # Append the domain if not present
                elif not image_link.startswith(("http", "https")):
                    if not image_link.startswith("/"):
                        image_link = "/{0}".format(image_link)
                    image_link = "{0}{1}".format(self.domain.strip("/"), image_link.replace("./", "/"))

                # Ignore the image if its path matches the exclude list
                if not self.exclude_url(image_link):
                    continue

                # Ignore images from other domains
                image_link_parsed = urlparse(image_link)
                if image_link_parsed.netloc != self.target_domain:
                    continue

                # Only keep the image if robots.txt allows fetching it
                if self.can_fetch(image_link):
                    logging.debug("Found image : {0}".format(image_link))
                    image_list = "{0}<image:image><image:loc>{1}</image:loc></image:image>".format(image_list, self.htmlspecialchars(image_link))

        # Was a last-modified date fetched?
        lastmod = ""
        if date:
            lastmod = "<lastmod>" + date.strftime('%Y-%m-%dT%H:%M:%S+00:00') + "</lastmod>"

        print("<url><loc>" + self.htmlspecialchars(url.geturl()) + "</loc>" + lastmod + image_list + "</url>", file=self.output_file)
        if self.output_file:
            self.output_file.flush()
        # Found links
        links = self.linkregex.findall(msg)
        for link in links:
            link = link.decode("utf-8", errors="ignore")
            link = self.clean_link(link)
            logging.debug("Found : {0}".format(link))

            if link.startswith('/'):
                link = url.scheme + '://' + url.netloc + link
            elif link.startswith('#'):
                link = url.scheme + '://' + url.netloc + url.path + link
            elif link.startswith(("mailto", "tel")):
                continue
            elif not link.startswith(('http', "https")):
                link = url.scheme + '://' + url.netloc + '/' + link

            # Remove the anchor part if needed
            if "#" in link:
                link = link[:link.index('#')]

            # Drop attributes matching the drop patterns if needed
            for to_drop in self.drop:
                link = re.sub(to_drop, '', link)

            # Parse the URL to get the domain and the file extension
            parsed_link = urlparse(link)
            domain_link = parsed_link.netloc
            target_extension = os.path.splitext(parsed_link.path)[1][1:]

            if link in self.crawled:
                continue
            if link in self.tocrawl:
                continue
            if link in self.excluded:
                continue
            if domain_link != self.target_domain:
                continue
            if parsed_link.path in ["", "/"]:
                continue
            if "javascript" in link:
                continue
            if self.is_image(parsed_link.path):
                continue
            if parsed_link.path.startswith("data:"):
                continue

            # Count one more URL
            self.nb_url += 1

            # Check if crawling is allowed by robots.txt
            if not self.can_fetch(link):
                self.exclude_link(link)
                self.nb_rp += 1
                continue

            # Check if the current file extension is allowed
            if target_extension in self.skipext:
                self.exclude_link(link)
                self.nb_exclude += 1
                continue

            # Check that the current URL doesn't contain an excluded word
            if not self.exclude_url(link):
                self.exclude_link(link)
                self.nb_exclude += 1
                continue

            self.tocrawl.add(link)

        return None
    def clean_link(self, link):
        parts = list(urlparse(link))
        # Normalize "./" and "//" sequences in the path component.
        parts[2] = parts[2].replace("./", "/")
        parts[2] = parts[2].replace("//", "/")
        return urlunparse(parts)

    def is_image(self, path):
        mt, _ = mimetypes.guess_type(path)
        return mt is not None and mt.startswith("image/")

    def __continue_crawling(self):
        if self.tocrawl:
            self.__crawling()

    def exclude_link(self, link):
        if link not in self.excluded:
            self.excluded.add(link)
    def check_robots(self):
        robots_url = urljoin(self.domain, "robots.txt")
        self.rp = RobotFileParser()
        self.rp.set_url(robots_url)
        self.rp.read()

    def can_fetch(self, link):
        try:
            if self.parserobots:
                if self.rp.can_fetch("*", link):
                    return True
                logging.debug("Crawling of {0} disabled by robots.txt".format(link))
                return False

            # robots.txt is not parsed, so everything can be fetched.
            return True
        except Exception:
            # On error, continue crawling.
            logging.debug("Error while parsing robots.txt")
            return True
    def exclude_url(self, link):
        for ex in self.exclude:
            if ex in link:
                return False
        return True

    def htmlspecialchars(self, text):
        return text.replace("&", "&amp;").replace('"', "&quot;").replace("<", "&lt;").replace(">", "&gt;")

    def make_report(self):
        print("Number of found URLs : {0}".format(self.nb_url))
        print("Number of links crawled : {0}".format(len(self.crawled)))
        if self.parserobots:
            print("Number of links blocked by robots.txt : {0}".format(self.nb_rp))
        if self.skipext or self.exclude:
            print("Number of links excluded : {0}".format(self.nb_exclude))

        for code in self.response_code:
            print("Number of HTTP {0} responses : {1}".format(code, self.response_code[code]))

        for code in self.marked:
            print("Links with status {0}:".format(code))
            for uri in self.marked[code]:
                print("\t- {0}".format(uri))
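

# Minimal usage sketch (an assumption, not part of the original module): it relies on
# config.py defining xml_header, xml_footer and crawler_user_agent, which run() and the
# crawl loop above already require. The domain and output path below are hypothetical.
if __name__ == "__main__":
    crawler = Crawler(domain="https://example.com/", output="sitemap.xml",
                      parserobots=True, report=True)
    crawler.run()          # writes the sitemap XML to the output file
    crawler.make_report()  # prints crawl statistics to stdout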