#!/usr/bin/env python
# coding: utf-8
import urllib
def filter_src(file_name):
    """Collect the distinct resources referenced by 404 lines of a log file.

    Every line containing the text ``404`` is split on single spaces and
    its eighth field (index 7) is taken as the requested resource.  Any
    ``/static`` substring is removed from that field and its last
    character is dropped before the result is stored.

    Args:
        file_name: path of the log file to scan.

    Returns:
        A list of normalised resource strings in first-seen order, without
        duplicates.  The list is also printed before it is returned.
    """
    resource_list = []
    seen = set()  # O(1) membership test; resource_list keeps the order
    with open(file_name) as f_obj:
        for f_line in f_obj:
            if '404' not in f_line:
                continue
            fields = f_line.strip().split(' ')
            if len(fields) < 8:
                # "404" matched somewhere, but the line has no eighth
                # field to read a resource from -- skip it instead of
                # raising IndexError.
                continue
            str_goal = fields[7]
            # NOTE(review): the last character is always dropped; this is
            # presumably a trailing delimiter from the log format --
            # confirm against a real log line.
            # NOTE(review): the original indentation was lost; this
            # assumes resources without "/static" are collected as well.
            resource = str_goal.replace('/static', '')[:-1]
            # De-duplicate on the normalised value.  Comparing the raw
            # field against the already-normalised entries never matches,
            # so duplicates would slip through.
            if resource in seen:
                continue
            seen.add(resource)
            print(str_goal)
            resource_list.append(resource)
    print(resource_list)
    return resource_list
def down_src(source_list):
    """Download each resource in *source_list* and save it below ``src``.

    Every entry is concatenated directly onto the base URL
    ``http://www.ttcrm.com`` to form the download URL, and onto the local
    directory name ``src`` to form the destination path, so each entry
    should begin with ``/`` (for example ``/css/a.css``).  The download
    URL is printed before each fetch.

    Args:
        source_list: iterable of resource path strings.

    Returns:
        None.  Errors raised while fetching or writing propagate to the
        caller.
    """
    import os
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    base_url = "http://www.ttcrm.com"
    down_path = r"src"
    for source in source_list:
        source_url = base_url + source
        # Plain concatenation on purpose: os.path.join would discard
        # down_path whenever source starts with "/".
        source_path = down_path + source
        print(source_url)

        # Nested resources need their parent directory to exist before
        # the output file can be opened.
        target_dir = os.path.dirname(source_path)
        if target_dir and not os.path.isdir(target_dir):
            os.makedirs(target_dir)

        source_stream = urlopen(source_url)
        try:
            payload = source_stream.read()
        finally:
            source_stream.close()

        # Binary mode keeps images and other non-text resources intact.
        with open(source_path, 'wb') as f_obj:
            f_obj.write(payload)
if __name__ == '__main__':
    # Script entry point: extract the missing resources listed in the
    # log file and hand them straight to the downloader.
    log_file = 'src.txt'
    down_src(filter_src(log_file))
# Key point: the downloaded content must be saved in binary mode ('wb'):
#     f_obj = open(source_path,'wb')
#     f_obj.write(source_stram.read())