# website image extraction code by Ryan Kulla, rkulla@gmail.com
import gl
from cursor import wait_cursor, normal_cursor
from show_message import show_message
from img_screen import get_center, paint_screen
from load_img import load_img
from input_box import ask
from HTMLParser import HTMLParser
import htmllib
import urllib2
import formatter
from pygame.display import set_caption

# NOTE(review): this module was recovered from a whitespace-collapsed copy;
# block nesting was reconstructed from syntax and should be confirmed against
# a pristine copy if one is available.

# Prefixes that mark an image source as an absolute URL.
_ABSOLUTE_PREFIXES = ("http://", "https://")


class CheckIndexHTML(HTMLParser):
    """Parser that flags pages whose text is a plain 404 message."""

    def handle_data(self, data):
        """Set gl.INDEX_OF when a text node is exactly a known 404 message."""
        if data in ("404 Not Found", "404 Error"):
            gl.INDEX_OF = True


def check_indexhtml():
    """Fetch gl.URL and dispatch to the matching extractor.

    When the URL looks like a directory (more than three slashes, not ending
    in "index.html", and no '.' after the last slash) "<URL>/index.html" is
    probed first.  If that probe is missing (HTTP 404, or a page whose text
    is a 404 message) the URL is handled by index_of_files(); otherwise it
    is handled by html_file().

    Side effects: resets gl.INDEX_OF and gl.URL_ERROR, may append '/' to
    gl.URL, and sets gl.URL_ERROR to True when the fetch or parse fails.
    """
    gl.INDEX_OF = False
    # Start every attempt clean; nothing else in this module clears the flag,
    # so a previous failure must not leak into this attempt.
    gl.URL_ERROR = False
    parser = CheckIndexHTML()
    try:
        # check if URL is using an index.html or "Index Of Files"
        looks_like_directory = (
            gl.URL.count('/') > 3
            and not gl.URL.endswith("index.html")
            and not gl.URL[gl.URL.rfind('/'):].count('.')
        )
        if looks_like_directory:
            # Avoid producing "dir//index.html" when the slash is already there.
            if not gl.URL.endswith('/'):
                gl.URL = gl.URL + '/'
            try:
                html = urllib2.urlopen(gl.URL + "index.html").read()
            except urllib2.HTTPError as err:
                if err.code != 404:
                    raise
                # urlopen raises on 404 instead of returning the error page,
                # so a missing index.html has to be detected here.
                gl.INDEX_OF = True
                html = ""
        else:
            html = urllib2.urlopen(gl.URL).read()
        parser.feed(html)
        parser.close()
    except Exception:
        gl.URL_ERROR = True
        return
    if gl.INDEX_OF:
        index_of_files()
    else:
        html_file()


class IndexOfParser(HTMLParser):
    """Parser that collects image entries from a directory-listing page."""

    def handle_starttag(self, tag, attrs):
        """Arm gl.SHOW_DATA when an attribute value names an image file.

        An attribute value qualifies when it contains one of gl.IMG_TYPES and
        does not contain "/icons".
        """
        for attr in attrs:
            value = attr[1]
            if value is None:
                # Attributes without a value are reported as None; skip them.
                continue
            for ext in gl.IMG_TYPES:
                if value.find(ext) != -1 and value.find("/icons") == -1:
                    gl.SHOW_DATA = True

    def handle_data(self, data):
        """Append gl.URL + data to gl.files when the preceding tag armed it."""
        if gl.SHOW_DATA:
            gl.files.append(gl.URL + data)
            gl.SHOW_DATA = False


def index_of_files():
    """Fetch gl.URL and collect image links from it with IndexOfParser.

    Sets gl.URL_ERROR to True if the fetch or parse fails.
    """
    parser = IndexOfParser()
    try:
        html = urllib2.urlopen(gl.URL).read()
        parser.feed(html)
        parser.close()
    except Exception:
        gl.URL_ERROR = True
        return


class TagParse(htmllib.HTMLParser):
    """htmllib parser that records the images referenced by a page."""

    def handle_image(self, source, alt, ismap, align, width, height):
        """Record an image source in gl.files via print_links().

        Relative sources whose extension matches an entry of gl.IMG_TYPES are
        resolved against gl.URL (after trimming a trailing file name and
        slash from it).  Any source containing "http" is passed on as well
        and switches gl.JUST_SOURCE on.
        """
        for ext in gl.IMG_TYPES:
            # ext[1:] is compared with the text after the last '.', so the
            # entries presumably look like ".jpg" -- TODO confirm in gl.
            is_relative_image = (
                source.find(ext) != -1
                and source.split('.')[-1] == ext[1:]
                and source[:4] != "http"
                and source[:4] != "www."
            )
            if is_relative_image:
                if source[0] == '/':
                    source = source[1:]
                if gl.URL.count('/') >= 3 and gl.URL.split('/')[-1].find('.') != -1:
                    # get rid of any bla.html type endings; cut at the LAST
                    # slash so a repeated segment earlier in the URL cannot
                    # truncate it in the wrong place
                    gl.URL = gl.URL[:gl.URL.rfind('/') + 1]
                if gl.URL[-1] == '/':
                    gl.URL = gl.URL[:-1]
                print_links(source, alt)
            if source.find("http") != -1:
                gl.JUST_SOURCE = 1
                if gl.URL[-1] == '/':
                    gl.URL = gl.URL[:-1]
                print_links(source, alt)


def print_links(source, alt):
    """Add the image at *source* to gl.files if it is not already there.

    Sources starting with "http://" or "https://" are stored unchanged; any
    other source is stored as "<gl.URL>/<source>".

    source -- image location taken from the page
    alt    -- alternate text of the image (unused)
    """
    if source.startswith(_ABSOLUTE_PREFIXES):
        link_name = source
    else:
        link_name = "%s/%s" % (gl.URL, source)
    if link_name not in gl.files:
        gl.files.append(link_name)


def html_file():
    """Fetch gl.URL and collect its images with TagParse.

    Sets gl.URL_ERROR to True if the fetch or parse fails.
    """
    parser = TagParse(formatter.NullFormatter())
    try:
        html = urllib2.urlopen(gl.URL).read()
        parser.feed(html)
        parser.close()
    except Exception:
        gl.URL_ERROR = True
        return


def open_url(screen, img):
    """Prompt for a web URL and load the images found there.

    screen -- surface passed through to the drawing and loading helpers
    img    -- current image, returned unchanged when the prompt yields None

    Returns a tuple (image, number_of_images).  When the prompt yields None
    the previous image and file count are returned and gl.files is left
    untouched.  A URL ending in one of gl.IMG_TYPES is loaded directly.
    Otherwise the page is scanned; gl.ERROR_IMG is used when the scan fails
    or finds nothing.
    """
    gl.ISURL = 1
    num_imgs = len(gl.files)
    paint_screen(screen, gl.BLACK)
    set_caption("Extract from Web - imgv")
    normal_cursor()
    show_message(screen, "Enter a Web URL to extract images from", 20, 15, ("transparent"))
    gl.URL = ask(screen, "http://")
    if gl.URL is None:
        return (img, num_imgs)

    gl.files = []
    wait_cursor()
    show_message(screen, "Loading. Please wait..", 39, 42, ("transparent"))
    # A URL that already points at an image needs no page scan.
    for ext in gl.IMG_TYPES:
        if gl.URL.endswith(ext):
            gl.files.append(str(gl.URL))
            return (load_img(gl.files[0], screen), 1)

    check_indexhtml()
    if gl.URL_ERROR:
        gl.files.append(gl.ERROR_IMG)
        return (load_img(gl.ERROR_IMG, screen), len(gl.files))
    if not gl.files:
        gl.files.append(gl.ERROR_IMG)
    gl.files = [x.replace(' ', '%20') for x in gl.files]  # urls need %20 for spaces
    return (load_img(gl.files[0], screen), len(gl.files))