需求描述:

1.测试网站图片上传功能时,或多或少会留下一些垃圾图片。
2.网站广告位长期替换
3.图片重复上传
4.定期清理

实现方法:

1.获取本地某目录下所有图片 (os)
2.获取数据库中某张表某个字段里引用的图片名称 (MySQLdb,re,BeautifulSoup)
3.取它们的差集,并删除差集中在本地存在的图片文件

 

import re
import MySQLdb
from BeautifulSoup import BeautifulSoup
import os
import time
 
# MySQL connection settings: host, credentials, database and the table to scan.
 
MYSQL_HOST      = "127.0.0.1"
MYSQL_USER_NAME = "admin"
MYSQL_PWD       = "admin"
MYSQL_DB_NAME = "test"
MYSQL_DB_TABLE = "goods"
 
# Local directory that holds the image files to be checked.
 
FILE_PATH = "图片路径" # placeholder meaning "image path"; set to the real directory, ending with a path separator
 
def _fileDir (filePath):
    """Return the names of the regular files directly inside ``filePath``.

    Sub-directories are skipped and the listing is not recursive.

    Args:
        filePath: Directory to list, with or without a trailing path
            separator.

    Returns:
        A list of bare file names (no directory component).

    Raises:
        OSError: If ``filePath`` cannot be listed.
    """
    # os.path.join instead of plain concatenation: "dir" + "a.jpg" yields
    # "dira.jpg" when the directory has no trailing separator, which made
    # every isfile() check fail and the result silently empty.
    return [name for name in os.listdir(filePath)
            if os.path.isfile(os.path.join(filePath, name))]
 
 
 
 
def _createHTML (goodsDesc):
    """Parse an HTML fragment with BeautifulSoup and return the parsed document.

    Args:
        goodsDesc: HTML markup to parse.

    Returns:
        The ``BeautifulSoup`` object built from ``goodsDesc``.
    """
    parsedDesc = BeautifulSoup(goodsDesc)
    return parsedDesc
 
def _getGoodsId ():
    """Fetch every ``goods_id`` from the configured table.

    Returns:
        The result of ``cursor.fetchall()``: a sequence of one-element rows,
        each holding a ``goods_id``.

    Raises:
        MySQLdb.Error: If connecting or querying fails.
    """
    conn = MySQLdb.connect(host=MYSQL_HOST,user=MYSQL_USER_NAME,passwd=MYSQL_PWD,db=MYSQL_DB_NAME)
    try:
        c = conn.cursor()
        try:
            # The table name comes from a module constant; identifiers cannot
            # be passed as bound query parameters.
            c.execute("SELECT goods_id FROM " + MYSQL_DB_TABLE)
            return c.fetchall()
        finally:
            c.close()
    finally:
        # The original closed only the cursor and leaked the connection.
        conn.close()
 
 
def _getGoodsDesc (gid):
    """Fetch the ``goods_desc`` column for one goods row.

    Args:
        gid: Value of ``goods_id`` identifying the row.

    Returns:
        The stored description, or an empty string when no row matches or
        the column is NULL.

    Raises:
        MySQLdb.Error: If connecting or querying fails.
    """
    conn = MySQLdb.connect(host=MYSQL_HOST,user=MYSQL_USER_NAME,passwd=MYSQL_PWD,db=MYSQL_DB_NAME)
    try:
        c = conn.cursor()
        try:
            # gid is passed as a bound parameter instead of being spliced into
            # the SQL text. The table name is a module constant and cannot be
            # bound, so it is still concatenated.
            sql = "SELECT goods_desc FROM " + MYSQL_DB_TABLE + " WHERE goods_id = %s LIMIT 1"
            c.execute(sql, (gid,))
            row = c.fetchone()
        finally:
            c.close()
    finally:
        # This function runs once per goods row, so an unclosed connection
        # here accumulates quickly.
        conn.close()

    # fetchone() returns None when the row is gone (e.g. deleted since the id
    # list was read); indexing it used to raise TypeError.
    if row is None or row[0] is None:
        return ""
    return row[0]
 
 
 
 
def getOnlineImages (goodsDescDom):
    """Collect the upload-relative image names referenced by ``<img>`` tags.

    Args:
        goodsDescDom: Parsed document exposing ``findAll('img')``; each
            returned tag must support ``.get('src')``.

    Returns:
        A list with, for every ``<img>`` whose ``src`` contains the upload
        prefix, the part of ``src`` that follows the prefix. Tags without a
        ``src`` attribute or with a non-matching ``src`` are skipped.
    """
    # NOTE(review): "p_w_picpaths" looks like a blog-platform mangling of
    # "images" -- confirm the real upload URL prefix before running.
    pattern = re.compile(r'/p_w_picpaths/upload/Image/(.+)')
    names = []
    for tag in goodsDescDom.findAll('img'):
        # .get() instead of tag['src']: an <img> without a src attribute used
        # to abort the whole scan with KeyError.
        src = tag.get('src')
        if not src:
            continue
        match = pattern.search(src)
        if match:
            names.append(str(match.group(1)))
    return names
 
 
if __name__  == "__main__":
    # Delete every file in FILE_PATH that no goods description references.
    localFiles = set(_fileDir(FILE_PATH))

    referenced = set()
    for row in _getGoodsId():
        print(row[0])
        referenced.update(getOnlineImages(_createHTML(_getGoodsDesc(row[0]))))

    # One-sided difference: local files that are NOT referenced. The original
    # built a symmetric difference, which also contained referenced names
    # missing from the top-level listing (e.g. "sub/x.jpg"); those then passed
    # the isfile() check below and in-use images were deleted.
    orphans = localFiles - referenced

    print(len(orphans))

    for name in orphans:
        path = os.path.join(FILE_PATH, name)
        # Re-check right before removal in case the file vanished meanwhile.
        if os.path.isfile(path):
            os.remove(path)
            print(name)