import codecs
import os
import re
import time

import requests
from bs4 import BeautifulSoup
from threadpool import ThreadPool, makeRequests

# Root directory under which all per-type output (csv logs, images, roms)
# is written.
DOWNLOAD_ROOT = '../rom-download/completeroms/'

# Supported console categories, mapped to the number of paginated index
# pages to crawl on completeroms.com for each category.
SUPPORT_TYPES = {
    'nintendo': 64,
    'gameboy-color': 23,
    'game-gear': 14,
}
class RomDownloader:
    """Crawler for completeroms.com.

    Pipeline per console category: scrape the paginated index for rom ids,
    resolve each rom's download page in a thread pool, log the resolved
    urls to a csv, download cover images concurrently, then download the
    rom files sequentially (politely throttled).
    """

    def __init__(self, types):
        # Directory for cover images; set per-category in download_img_async().
        self.img_root = ''
        # Rom dicts successfully resolved by the worker pool.
        self.pool_rom_list = []
        # Console categories to crawl (keys of SUPPORT_TYPES).
        self.download_types = types

    def start_fetch(self):
        """Run the full pipeline for every requested category."""
        for my_type in self.download_types:
            if my_type not in SUPPORT_TYPES:
                continue
            rom_list = self._collect_rom_ids(my_type, SUPPORT_TYPES[my_type])
            self.parse_download_info_async(rom_list)
            self.write_url_map_csv(my_type)
            self.download_img_async(my_type)
            self.download_roms_slowly(my_type)

    @staticmethod
    def _collect_rom_ids(my_type, max_page):
        """Scrape all index pages for *my_type* and return a list of
        {'index': n, 'rom_id': id} dicts.
        """
        rom_list = []
        count = 0
        for page in range(max_page):
            index_url = 'http://www.completeroms.com/roms/%s/%d' % (my_type, page + 1)
            req = requests.get(index_url)
            bs = BeautifulSoup(req.text, features='html5lib')
            tr_nodes = bs.find('table', class_='table').find_all('tr')
            for tr_node in tr_nodes:
                a_node = tr_node.find('a')
                if a_node is None:
                    continue
                detail_url = a_node.attrs['href']
                # The rom id is the last path component of the detail url.
                rom_id = detail_url[detail_url.rfind('/') + 1:]
                if not rom_id:
                    continue
                rom_list.append({'index': count, 'rom_id': rom_id})
                count += 1
            print('parse index ok[%s] total:[%d]' % (index_url, count))
        return rom_list

    def parse_download_info_async(self, rom_list):
        """Resolve download info for every rom in *rom_list* in a thread
        pool; successful results accumulate in self.pool_rom_list.
        """
        print('start parse download info, count[%d]' % len(rom_list))
        self.pool_rom_list = []
        pool = ThreadPool(64)
        pool_requests = makeRequests(self.get_download_url_pool, rom_list,
                                     self.get_download_url_done,
                                     self.get_download_url_error)
        for req in pool_requests:
            pool.putRequest(req)
        pool.wait()
        print('parse download info, count[%d]' % len(self.pool_rom_list))

    @staticmethod
    def get_download_url_error(req_args, error_info):
        # Pool error callback: the work item is the first positional
        # argument of the failed request. Use .get() because final_url is
        # only set once the worker has started.
        print('url error:[%s]' % req_args.args[0].get('final_url', ''))
        print(error_info)

    @staticmethod
    def get_download_url_pool(rom_info):
        """Worker: fetch the 'thankyou' page for one rom and record the
        file name, cover image url and direct zip url into *rom_info*.
        Returns *rom_info* (possibly with some keys absent on parse miss).
        """
        final_url = 'http://www.completeroms.com/thankyou.php?id=%s' % rom_info['rom_id']
        rom_info['final_url'] = final_url
        headers = {
            # Fixed: header name previously contained stray spaces
            # ('Accept - Encoding'), which servers ignore.
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0'
        }
        req = requests.get(final_url, headers=headers, timeout=60)
        bs = BeautifulSoup(req.text, features='html5lib')

        fname_node = bs.find('h3')
        # Always set file_name so downstream readers never hit a KeyError.
        file_name = fname_node.text.strip() if fname_node else ''
        rom_info['file_name'] = file_name

        img_node = bs.find('img', class_='rom-cover')
        # file_name is required here to derive the local image name; the
        # original crashed with NameError when <h3> was missing.
        if img_node and file_name:
            img_url = img_node.attrs['src']
            rom_info['img_url'] = img_url
            ext_name = img_url[img_url.rfind('.'):].lower()
            simple_name = file_name[:file_name.rfind('.')]
            rom_info['simple_name'] = simple_name
            rom_info['img_name'] = simple_name + ext_name

        # The direct download link is embedded in an inline script:
        #   var url = "...";
        m = re.search(r'.*var url = "(?P<zip_url>.*)";.*', req.text)
        if m:
            rom_info['file_url'] = m.group('zip_url')
        return rom_info

    def get_download_url_done(self, req_args, rom_info):
        """Pool success callback: record the resolved rom."""
        if rom_info is None:
            print('get download url error!')
            # Fixed: the original fell through and crashed indexing None.
            return
        print('get download url ok[%3d.%s]'
              % (rom_info['index'], rom_info.get('file_name', '')))
        self.pool_rom_list.append(rom_info)

    def write_url_map_csv(self, my_type):
        """Dump the resolved rom list for *my_type* to a timestamped csv."""
        csv_root = DOWNLOAD_ROOT + my_type + '/'
        if not os.path.exists(csv_root):
            os.makedirs(csv_root)
        time_str = time.strftime('%Y-%m-%d-%H%M%S', time.localtime())
        log_path = '%surl-map-%s.csv' % (csv_root, time_str)
        # 'with' guarantees the handle is closed even if a row write fails.
        with codecs.open(log_path, 'w', 'utf-8') as csv_writer:
            csv_writer.write('Name,Rom Url,Image Url,Referer Url\n')
            for rom_info in self.pool_rom_list:
                if 'simple_name' not in rom_info:
                    print('no simple name[%s]' % rom_info.get('file_name', ''))
                    continue
                # .get() for file_url/img_url: the regex or image parse may
                # have missed (the original raised KeyError here).
                csv_writer.write('"%s","%s","%s","%s"\n' % (
                    rom_info['simple_name'] or '',
                    rom_info.get('file_url', ''),
                    rom_info.get('img_url', ''),
                    rom_info['final_url']))
        print('write csv file ok[%s]' % my_type)

    def download_img_async(self, my_type):
        """Download all cover images for *my_type* concurrently."""
        self.img_root = DOWNLOAD_ROOT + my_type + '/images/'
        if not os.path.exists(self.img_root):
            os.makedirs(self.img_root)
        print('start download images, count[%d]' % len(self.pool_rom_list))
        pool = ThreadPool(64)
        pool_requests = makeRequests(self.download_img_in_pool,
                                     self.pool_rom_list,
                                     self.download_img_done,
                                     self.download_img_error)
        for req in pool_requests:
            pool.putRequest(req)
        pool.wait()
        print('download images ok, count[%d]' % len(self.pool_rom_list))

    @staticmethod
    def download_img_error(req_args, error_info):
        # Pool error callback for image downloads.
        args = req_args.args[0]
        if 'img_url' in args:
            print('download image error:[%s]' % args['img_url'])
        print(error_info)

    def download_img_in_pool(self, rom_info):
        """Worker: fetch one cover image, skipping files already present
        or roms whose image was never resolved.
        """
        if 'img_url' not in rom_info or 'img_name' not in rom_info:
            return rom_info
        img_path = self.img_root + rom_info['img_name']
        if os.path.exists(img_path):
            return rom_info
        r = requests.get(rom_info['img_url'], timeout=60)
        with open(img_path, "wb") as code:
            code.write(r.content)
        return rom_info

    @staticmethod
    def download_img_done(req_args, rom_info):
        # Pool success callback for image downloads.
        print('finish download image[%d][%s]'
              % (rom_info['index'], rom_info.get('img_name', '')))

    def download_roms_slowly(self, my_type):
        """Download the rom files one by one with a polite delay, so the
        site is not hammered. Skips roms already on disk or unresolved.
        """
        roms_root = DOWNLOAD_ROOT + my_type + '/roms/'
        if not os.path.exists(roms_root):
            os.makedirs(roms_root)
        print('start download roms, count[%d]' % len(self.pool_rom_list))
        for rom_info in self.pool_rom_list:
            file_name = rom_info['file_name']
            rom_path = roms_root + file_name
            if os.path.exists(rom_path):
                print('exists rom[%s]' % file_name)
                continue
            if 'file_url' not in rom_info:
                # The zip-url regex missed on the detail page; nothing to fetch.
                continue
            headers = {
                # Referer is required by the site to serve the file.
                'Referer': rom_info['final_url'],
                # Fixed: header name previously contained stray spaces.
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0'
            }
            try:
                r = requests.get(rom_info['file_url'], headers=headers, timeout=300)
                with open(rom_path, "wb") as code:
                    code.write(r.content)
                # Fixed: message previously said 10s while sleeping 6s.
                print('download rom ok sleep 6s[%3d.%s]'
                      % (rom_info['index'], file_name))
                time.sleep(6)
            except Exception as e:
                print('exception[%s][%s]' % (rom_info['file_name'], e))
if __name__ == '__main__':
    # Script entry point: crawl all three supported console categories.
    print ('start fetch!!')
    wanted_types = [
        'gameboy-color',
        'nintendo',
        'game-gear',
    ]
    RomDownloader(wanted_types).start_fetch()