python wget url设置header

转载

mob6454cc7203e2 2024-09-12 13:52:07

文章标签 python 爬虫文件名缓存文件 文章分类 Python 后端开发

记性不好，又是学习新东西，自己写给自己看吧，有时间再详细弄弄。

声明一遍：这个是站内的开源爬虫程序urlwatch1.7~~

程序没有跑过，先把源码下下来学习了，接触到了distutils、optparse，又巩固了下os.path。

urlwatch文件用python写成，不过没有加py后缀。

前面都是一些说明和变量注释，从53行开始，把一些常用的方法与函数过了一遍。

# os.path.expanduser调用后会把'~'展开为当前用户的主目录，返回的urlwatch_dir就是 ~/.<pkgname> 目录（pkgname为urlwatch时即 ~/.urlwatch）
# 通过调用join方法可以使得路径的分隔符不受系统限制，win上是'\'，unix是'/'
 53 urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))
 54 urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
 55 cache_dir = os.path.join(urlwatch_dir, 'cache')
 56 scripts_dir = os.path.join(urlwatch_dir, 'lib')
 57 hooks_py = os.path.join(scripts_dir, 'hooks.py')
 58
# sys.argv[0]传入的脚本名称
# os.path.abspath取脚本的绝对路径
# os.path.dirname取除去文件名后的目录名
# os.path.split分隔文件名和目录名,用个tuple来装，bindir是路径末尾的那一级目录名，prefix是它前面的部分
 59 # Check if we are installed in the system already
 60 (prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0]))) 
# sys.path.append是将目录加入python系统目录，使其能够访问lib目录里的文件
 68     sys.path.append(os.path.join(prefix, bindir, 'lib'))
# logging日志模块
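# 取得以pkgname命名的logger，并设置日志级别
 94 log = logging.getLogger(pkgname)
 95 log.setLevel(logging.DEBUG)
# 添加空handler：NullHandler应是一个不输出任何内容的handler（其定义不在本文摘录中），作用是在没有配置其他handler时，避免logging打印“No handlers could be found for logger ...”的警告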
101 log.addHandler(NullHandler())
# 用': '（冒号加一个空格）把type.upper()和url连接成一个字符串
114     summary_txt = ': '.join((type.upper(), url))
# main开始
130 if __name__ == '__main__':
# 取当前系统时间
131     start = datetime.datetime.now()
 
再接下来就是郁闷我很久的optparse
# 先出个OptionParser,附上说明
134     parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
# 如果出现传入参数-v或--verbose，等价于options.verbose=True
135     parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
# 出现传入参数--urls, 等价于options.urls为--urls后紧跟的参数
# metavar是帮助信息中用来代表该选项取值的占位名（这里是FILE），有助于提醒用户该命令行参数所期待的参数，如 metavar="mode"（找了很久都没找到metavar的意思，这个是从别的文章扒下来的）
# 在传入help参数后此行的提示就应该为：
#         --urls=FILE        Read URLs from the specified file
136     parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    # 设置parse的默认值
140     parser.set_defaults(verbose=False, display_errors=False)
    # 解析args
142     (options, args) = parser.parse_args(sys.argv) 
# 解析完args后，接下来就是对args中的各个参数的生效设定以及合法性校验
# 首先是verbose，如果传入参数带有--verbose的话，那么创建一个StreamHandler，设置其级别为DEBUG，这样运行程序的日志就都会出现在console上了，方便用于调试
143     if options.verbose:
144         console = logging.StreamHandler()
145         console.setLevel(logging.DEBUG)
146         formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
147         console.setFormatter(formatter)
148         log.addHandler(console)
149         log.info('turning on verbose logging mode') 
# 接下来是display_errors，设置显示错误标志位
151     if options.display_errors:
152         log.info('turning display of errors ON')
153         display_errors = True 
# 再下来是urls，判断传入的urls参数是否为文件，是：打印日志，否：程序退出
155     if options.urls:
156         if os.path.isfile(options.urls):
157             urls_txt = options.urls
158             log.info('using %s as urls.txt' % options.urls)
159         else:
160             log.error('%s is not a file' % options.urls)
161             print 'Error: %s is not a file' % options.urls
162             sys.exit(1) 
# 最后是hooks，与urls一致
164     if options.hooks:
165         if os.path.isfile(options.hooks):
166             hooks_py = options.hooks
167             log.info('using %s as hooks.py' % options.hooks)
168         else:
169             log.error('%s is not a file' % options.hooks)
170             print 'Error: %s is not a file' % options.hooks
171             sys.exit(1) 
# 这段程序显而易见，偷懒不说了
173     # Created all needed folders
174     for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
175         if not os.path.isdir(needed_dir):
176             os.makedirs(needed_dir) 
# 以下这段也不说了，值得一说的是shutil，又学到个新模块，shutil.copy(src, dst)大致等价于sh中的cp src dst（复制文件内容和权限位），而shutil.copy2(src, dst)类似于cp -p src dst，拷贝的不仅是文件，还会尽量保留文件的元数据（mode、timestamps等；注意与cp -p不同，属主ownership不会被保留）
178     # Check for required files
179     if not os.path.isfile(urls_txt):···
192         if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
193             shutil.copy(urls_txt_example, urls_txt_fn)
194         if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
195             shutil.copy(hooks_py_example, hooks_py_fn) 
# 再往下，hooks_py其实里面只需要定义一个过滤器函数filter，可以利用正则式从结果中筛选需要的信息
206     if os.path.exists(hooks_py):
207         log.info('using hooks.py from %s' % hooks_py)
# 通过imp.load_source，将hooks_py作为名为hooks的模块导入工程，使得接下来能够使用hooks_py中的方法
208         hooks = imp.load_source('hooks', hooks_py)
# 判断filter是否为hooks模块的属性
209         if hasattr(hooks, 'filter'):
210             log.info('found and enabled filter function from hooks.py')
211             filter = hooks.filter
212         else:
213             log.warning('hooks.py has no filter function - ignoring')
# lambda在此等价于def filter(x, y): return y，在这种情况下，filter不做任何过滤，原样返回传入的第二个参数y
214             filter = lambda x, y: y
215     else:
216         log.info('not using hooks.py (file not found)')
217         filter = lambda x, y: y 
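# 这段是处理文件名的：对每个URL计算SHA-1哈希，用其十六进制摘要作为缓存文件夹中的文件名（SHA-1是哈希/摘要算法，并不是加密）；这样任意URL都能对应到一个定长且不含特殊字符的文件名，也避免了文件名混淆
# 用到了hashlib，hashlib.new('sha1')等价于hashlib.sha1()，试了下，两种功能一样；真正由python版本不同造成的差异体现在have_hashlib上：没有hashlib的旧版本改用sha模块的sha.new(url)。之后的处理都一样，update，再取摘要，只不过用的是hexdigest，用十六进制表示摘要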
219     for url in (x for x in open(urls_txt).read().splitlines() if not (x.startswith('#') or x.strip()=='')):
220         log.info('processing URL: %s' % url)
221         if have_hashlib:
222             sha_hash = hashlib.new('sha1')
223             sha_hash.update(url)
224         else:
225             sha_hash = sha.new(url)
226         filename = os.path.join(cache_dir, sha_hash.hexdigest()) 
# 终于出现try了，urlwatch真正干活的东东就只两个，urllib2.urlopen和difflib.unified_diff，urllib这个东西比较庞大，以后再研究。
# 程序调用了Request与urlopen去获取页面，然后通过difflib.unified_diff去比较当前获取页面的内容与缓存文件的内容是否不同，风格类似sh中的diff
227         try:
228             request = urllib2.Request(url, None, headers)
229             data = filter(url, urllib2.urlopen(request).read())
230             if os.path.exists(filename):
231                 log.info('%s exists - creating unified diff' % filename)
232                 old_data = open(filename).read()
233                 diff = ''.join(difflib.unified_diff(old_data.splitlines(1), data.splitlines(1)))
234                 if len(diff) > 0:
235                     log.info('%s has changed - adding diff' % url)
236                     details += foutput('changed', url, diff, summary)
237                 else:
238                     log.info('%s has not changed' % url)
239             else:
240                 log.info('%s does not exist - url is considered "new"' % filename)
241                 details += foutput('new', url, None, summary)
242             log.info('writing current content of %s to %s' % (url, filename))
243             open(filename, 'w').write(data)
244         except urllib2.HTTPError, error:
245             log.error('got HTTPError while loading url: %s' % error)
246             if display_errors:
247                 details += foutput('error', url, error, summary)
248         except urllib2.URLError, error:
249             log.error('got URLError while loading url: %s' % error)
250             if display_errors:
251                 details += foutput('error', url, error, summary) 
# 接下来是连续两个except捕捉异常
244         except urllib2.HTTPError, error:
248         except urllib2.URLError, error:
 
# 最后判断条件并输出结果
257     if len(summary) > 1:
268     if len(details) > 1:

--OVER，urlwatch比较简单，尽管我初学但还是看懂了，继续学习

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。