#!/usr/bin/env python
#-*- coding:utf-8 -*-
import os
import urllib2
import sys
from BeautifulSoup import BeautifulSoup
try:
url = sys.argv[1]
except IndexError:
print "geh - A simeple g.e-hentai downloader\nUsage: geh.py [url]"
def find_next_page_link(tag):
    """BeautifulSoup matcher: true for the '&gt;' (next page) anchor.

    Non-tag nodes (e.g. NavigableStrings) may raise TypeError on
    attribute access; treat those as non-matches.
    """
    try:
        return tag.name == 'a' and tag.text == '>'
    except TypeError:
        return False
def find_image(html):
    """Pull the full-size image URL out of a viewer page's HTML.

    The viewer page marks the main image with id="img"; return its
    src attribute.
    """
    img_node = BeautifulSoup(html).find('img', {'id': 'img'})
    return img_node['src']
def parse_index(url):
    """Download and parse one gallery index page.

    Returns a (title, next_page_tag, image_page_urls) triple, where
    next_page_tag is the '>' anchor tag (or None on the last page) and
    image_page_urls are the hrefs of the thumbnail grid (div#gdt).
    """
    soup = BeautifulSoup(urllib2.urlopen(url).read())
    thumb_grid = soup.find('div', {'id': 'gdt'})
    image_pages = [anchor['href'] for anchor in thumb_grid.findAll('a')]
    return soup.h1.text, soup.find(find_next_page_link), image_pages
c = 1
while True:
title, next_page_url, image_list = parse_index(url)
dst_dir = title
if not os.path.exists(dst_dir):
os.mkdir(dst_dir)
for page in image_list:
html = urllib2.urlopen(page).read()
image_url = find_image(html)
fn = '{0}.{1}'.format(str(c).zfill(3), image_url.split('.')[-1])
fn = os.path.join(dst_dir, fn)
print '{0}: {1} ... '.format(dst_dir, str(c).zfill(3)),
with file(fn, 'wb') as f:
image_data = urllib2.urlopen(image_url).read()
f.write(image_data)
print 'done'
c += 1
if not next_page_url:
break
url = next_page_url['href']