preface:工作中使用python进行处理常会遇到各种问题及技巧,为此汇总。python功能太多,记住常用即可。

环境:python3.7及以上、mac

python——pip、conda、ipython

  • 常用1:创建虚拟环境(从一个爸爸创造多个儿子,各个儿子之间环境互不影响;同一个服务器上自己的python环境不被其他人干扰,自己的多个项目也可以采用不同版本的tensorflow等包)
which conda:查看当下是哪个conda
conda env list:看看有哪些虚拟环境
conda create -n sftf python==3.7.6:创建虚拟环境
conda activate sftf:启动虚拟环境
conda deactivate:关闭虚拟环境
  • 常用2:pip安装包
  • pip install -r requirements.txt:批量安装某些包,包名每行一个,放在requirements.txt文件里面
  • pip install xxxxx -i https://pypi.tuna.tsinghua.edu.cn/simple:采用清华源来安装。其他镜像:阿里云 https://mirrors.aliyun.com/pypi/simple/ 、中科大 https://pypi.mirrors.ustc.edu.cn/simple/
  • 实在不行,先下载,再塞到site-packages文件下(/Users/shifeng/anaconda3/lib/python3.7/site-packages)
  • 注意:mac、linux不同环境下的site-packages可能不能通用
  • 常用3:ipython交互式调试各种问题

python——datetime、time

  • 常用1:取当前时间;时间、string、datetime三者转换;距今多少日等
  • 代码
# coding=utf-8
# from utils import time_utils  # 该文件命名为time_utils.py,放到utils文件夹下,按此方式导入调用即可
from datetime import date, timedelta, datetime
import time

def get_today(timeFormat='%Y%m%d'):
    """Return today's local date rendered with ``timeFormat``.

    The value comes from ``date.today()``, so any time-of-day directives in
    ``timeFormat`` render as midnight.
    """
    today = date.today()
    return today.strftime(timeFormat)

def get_yesterday(timeFormat='%Y%m%d'):
    """Return the local date one day before today, rendered with ``timeFormat``."""
    one_day = timedelta(1)
    yesterday = date.today() - one_day
    return yesterday.strftime(timeFormat)

def get_current(timeFormat='%Y%m%d'):
    """Return the current local time rendered with ``timeFormat``."""
    return time.strftime(timeFormat, time.localtime())

def get_date(diff_days=0, diff_hours=0, day_format='%Y%m%d', current_date=None):
    """Return a day (and optionally hour) shifted back from a reference time.

    :param diff_days: number of days to go back.
    :param diff_hours: number of hours to go back; ignored when
        ``current_date`` is given without an ``'hour'`` entry.
    :param day_format: ``strftime``/``strptime`` format of the ``'day'`` value.
    :param current_date: optional dict with a ``'day'`` string (in
        ``day_format``) and an optional ``'hour'`` string (``'%H:%M'``).
        When ``None`` the current local time is the reference.
    :return: ``{'day': ..., 'hour': ...}``, or just ``{'day': ...}`` when
        ``current_date`` carries no hour.

    The shift is done with ``timedelta`` on naive local datetimes, so a
    one-day shift always lands on the previous calendar day even across a
    daylight-saving change. Day and hour are formatted separately, so
    ``day_format`` may contain whitespace.
    """
    hour_format = '%H:%M'

    if current_date is None:
        moment = datetime.now() - timedelta(days=diff_days, hours=diff_hours)
        return {'day': moment.strftime(day_format),
                'hour': moment.strftime(hour_format)}

    day = current_date.get('day', None)
    hour = current_date.get('hour', None)

    if hour is None:
        # Day-only reference: only the day offset applies.
        moment = datetime.strptime(day, day_format) - timedelta(days=diff_days)
        return {'day': moment.strftime(day_format)}

    moment = datetime.strptime('%s %s' % (day, hour), day_format + ' ' + hour_format)
    moment -= timedelta(days=diff_days, hours=diff_hours)
    return {'day': moment.strftime(day_format),
            'hour': moment.strftime(hour_format)}

def get_last_half_hour(timeFormat='%H-%M-%S', now=None):
    """Return the most recent half-hour boundary (hh:00 or hh:30).

    :param timeFormat: ``strftime`` format of the returned string.
    :param now: optional ``datetime`` used as the reference instead of the
        current local time (useful for back-filling and testing).
    :return: the reference time floored to the half hour, seconds zeroed.

    A reference exactly on hh:30 maps to hh:30, just as hh:00 maps to hh:00.
    """
    moment = datetime.now() if now is None else now
    floored_minute = 30 if moment.minute >= 30 else 0
    boundary = moment.replace(minute=floored_minute, second=0, microsecond=0)
    return boundary.strftime(timeFormat)

def convert_digit_to_str(digit):
    """Return ``digit`` as text, prefixed with ``'0'`` when it is below 10."""
    text = str(digit)
    return str(0) + text if digit < 10 else text

def get_before_last_half_hour(timeFormat='%H-%M-%S', now=None):
    """Return the half-hour boundary preceding the most recent one.

    :param timeFormat: ``strftime`` format of the returned string.
    :param now: optional ``datetime`` used as the reference instead of the
        current local time (useful for back-filling and testing).
    :return: the reference time floored to the half hour, minus 30 minutes.

    A reference exactly on hh:30 is floored to hh:30, so the result is hh:00.
    """
    moment = datetime.now() if now is None else now
    floored_minute = 30 if moment.minute >= 30 else 0
    boundary = moment.replace(minute=floored_minute, second=0, microsecond=0)
    return (boundary - timedelta(minutes=30)).strftime(timeFormat)

def get_week_before(day_time, timeFormat='%Y%m%d'):
    """Return the date/time seven days before ``day_time``.

    :param day_time: date string in ``timeFormat``.
    :param timeFormat: ``strptime``/``strftime`` format for input and output.
    :return: the shifted value rendered with ``timeFormat``.

    Uses calendar arithmetic, so the result is exactly seven days earlier
    even when a daylight-saving change falls inside the week.
    """
    shifted = datetime.strptime(day_time, timeFormat) - timedelta(days=7)
    return shifted.strftime(timeFormat)


def get_day_before(day_time, num_day, timeFormat='%Y%m%d'):
    """Return the date/time ``num_day`` days before ``day_time``.

    :param day_time: date string in ``timeFormat``.
    :param num_day: number of days to go back (negative values go forward).
    :param timeFormat: ``strptime``/``strftime`` format for input and output.
    :return: the shifted value rendered with ``timeFormat``.

    Uses calendar arithmetic, so one day back is always the previous
    calendar day even across a daylight-saving change.
    """
    shifted = datetime.strptime(day_time, timeFormat) - timedelta(days=num_day)
    return shifted.strftime(timeFormat)


def get_minute_before(timeStr, minute, timeFormat='%H-%M'):
    """Return the time ``minute`` minutes before ``timeStr``.

    :param timeStr: time string in ``timeFormat``.
    :param minute: number of minutes to go back.
    :param timeFormat: ``strptime``/``strftime`` format for input and output.
    :return: the shifted value rendered with ``timeFormat``.

    Works on naive datetimes instead of epoch timestamps: a time-only
    format parses to the year 1900, which ``time.mktime`` cannot represent
    on every platform.
    """
    shifted = datetime.strptime(timeStr, timeFormat) - timedelta(minutes=minute)
    return shifted.strftime(timeFormat)


def convert_time(timeStr, input_format, out_format):
    """Re-render a time string from ``input_format`` into ``out_format``.

    The value is parsed, converted to a local epoch timestamp and back, then
    formatted. Returns ``None`` (after printing a message) when that
    conversion raises ``OverflowError``.
    """
    try:
        parsed = time.strptime(timeStr, input_format)
        seconds = time.mktime(parsed)
    except OverflowError:
        print("timeStr is: ", timeStr, "out of range")
        return None
    local = time.localtime(seconds)
    return time.strftime(out_format, local)


def get_diff_days(timeStr1, timeStr2, timeFormat="%Y%m%d"):
    """Return the number of whole days from ``timeStr2`` to ``timeStr1``.

    Both strings are parsed with ``timeFormat`` and truncated to midnight
    before subtracting, so only the date parts matter. The result is negative
    when ``timeStr1`` is the earlier date.
    """
    midnight = dict(hour=0, minute=0, second=0, microsecond=0)
    later, earlier = (
        datetime.strptime(text, timeFormat).replace(**midnight)
        for text in (timeStr1, timeStr2)
    )
    delta = later - earlier
    return delta.days

def get_today_before(days, timeFormat='%Y%m%d'):
    """Return the local date ``days`` days before today, rendered with ``timeFormat``.

    A negative ``days`` yields a date in the future.
    """
    offset = timedelta(days)
    target = date.today() - offset
    return target.strftime(timeFormat)

def get_now_str():
    """Return the current local time as ``'YYYY-MM-DD HH:MM:SS'``."""
    now = datetime.now()
    return now.strftime("%Y-%m-%d %H:%M:%S")

def get_now_unix():
    """Return the current local time as a Unix timestamp (float, whole seconds)."""
    now_tuple = datetime.now().timetuple()
    return time.mktime(now_tuple)


'''
四种转换:
timeStr <-> timeUnix互转
datetime <-> timeStr互转
'''
def timeStr_to_timeUnix(timeStr="2020-12-26 16:10:10", timeFormat='%Y-%m-%d %H:%M:%S'):
    """Parse ``timeStr`` with ``timeFormat`` as local time and return its Unix timestamp."""
    parsed = time.strptime(timeStr, timeFormat)
    return time.mktime(parsed)

def timeUnix_to_timeStr(timeUnix=1608969619, timeFormat='%Y-%m-%d %H:%M:%S'):
    """Render the Unix timestamp ``timeUnix`` as local time using ``timeFormat``."""
    local = time.localtime(timeUnix)
    return time.strftime(timeFormat, local)

def dateTime_to_timeStr(dt, timeFormat='%Y-%m-%d %H:%M:%S'):
    """Render the ``datetime`` object ``dt`` as a string using ``timeFormat``.

    :param dt: ``datetime`` (or ``date``) instance to format.
    :param timeFormat: ``strftime`` format string.
    :return: the formatted string.
    """
    # Formatting is strftime; strptime is the parsing direction.
    return dt.strftime(timeFormat)

def timeStr_to_dateTime(timeStr="2020-12-26 16:10:10", timeFormat="%Y-%m-%d %H:%M:%S"):
    """Parse ``timeStr`` into a ``datetime`` object.

    :param timeStr: the string to parse.
    :param timeFormat: ``strptime`` format of ``timeStr``; defaults to the
        ``'YYYY-MM-DD HH:MM:SS'`` layout this function always used.
    :return: the parsed naive ``datetime``.
    :raises ValueError: when ``timeStr`` does not match ``timeFormat``.
    """
    return datetime.strptime(timeStr, timeFormat)


if __name__ == '__main__':
    # Smoke test: print sample results from the helpers defined in this module.
    print(get_yesterday())
    print(get_diff_days("20190801", "20190725", "%Y%m%d"))
    # timeStr_to_timeUnix is the string -> Unix timestamp helper in this
    # module; no function named get_timestamp is defined here.
    print(timeStr_to_timeUnix("2019-08-12 16:45:20", timeFormat="%Y-%m-%d %H:%M:%S"))
    print(timeStr_to_timeUnix("2019-08-12 16:45:21", timeFormat="%Y-%m-%d %H:%M:%S"))
    today = get_today(timeFormat="%Y-%m-%d %H:%M:%S")
    print(today)
    print(get_diff_days(today, "2019-09-01 10:04:14", timeFormat="%Y-%m-%d %H:%M:%S"))
    print(get_today_before(0))
    print(get_today_before(1))
    print(get_today_before(-1, timeFormat="%Y-%m-%d %H:%M:%S"))
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(now)
    print(get_minute_before(now, 5, timeFormat="%Y-%m-%d %H:%M:%S"))

python——redis、mysql

  • 常用1:读写mysql
  • 读取mysql:准备conn、cursor、sql;执行;关闭
  • 写入mysql:准备conn、cursor、sql;执行;conn.commit()提交;关闭
  • 调用:
'''
#xxx_conf.json文件:
{
    "xxx_db": {
      "host": "xxx",
      "user": "xxx",
      "psd": "xxx",
      "db":"xxxx"
    },
     "yyy_db": {
      "host": "xxx",
      "user": "xxx",
      "psd": "xx",
      "db":"xx"
    }
}
'''
'''
#utils/conf_parser.py文件:

# coding=utf8

import json


def parse_conf(f_path):
    """Load the JSON file at ``f_path`` and return it wrapped in a ``Config``."""
    loaded = Config(config_file=f_path)
    return loaded


class Config(object):
    """Attribute-style wrapper around a JSON-like configuration mapping.

    Top-level keys become instance attributes. Nested dicts (also dicts
    inside lists) are wrapped in ``Config`` as well, so values can be read as
    ``conf.section.key``. The source mapping is kept in ``self.dict`` and is
    modified in place while wrapping.
    """

    def __init__(self, config=None, config_file=None):
        # A file path takes precedence over an explicitly passed mapping.
        if config_file:
            with open(config_file, 'r') as fin:
                config = json.load(fin)

        self.dict = config
        if config:
            self._update(config)

    def __getitem__(self, key):
        return self.dict[key]

    def __contains__(self, item):
        return item in self.dict

    def items(self):
        return self.dict.items()

    def add(self, key, value):
        """Expose ``value`` as attribute ``key`` (``self.dict`` is not touched)."""
        self.__dict__[key] = value

    def _update(self, config):
        """Wrap nested dicts in ``Config`` and publish all keys as attributes."""
        if isinstance(config, dict):
            for key, value in config.items():
                if isinstance(value, dict):
                    config[key] = Config(value)
                elif isinstance(value, list):
                    config[key] = [Config(entry) if isinstance(entry, dict) else entry
                                   for entry in value]

            self.__dict__.update(config)
'''

from utils import conf_parser
conf = conf_parser.parse_conf("conf/xxx_conf.json")

def selectDataFromDB(beginDate, endDate):
    """Query the ``xxx_db`` database for distinct values between two dates.

    NOTE(review): both arguments are interpolated straight into the SQL text;
    confirm callers never pass untrusted input.
    """
    sql = f"select distinct xx from pic where xx between '{beginDate}' and '{endDate}'"
    return mysql_data_getter.BaseDataGetter.get_from_conf(conf.xxx_db, sql)

def insertData2mysql(xxxx):
    """Insert data into the ``yyy_db`` database; do nothing for empty input.

    :param xxxx: the records to insert (any sized collection).
    """
    # The guard must test the parameter itself; the original read an
    # undefined name and raised NameError.
    if len(xxxx) == 0:
        return
    insertSql = "xxx"
    mysql_data_getter.BaseDataGetter.insert_from_conf(conf.yyy_db, insertSql)
  • 代码:mysql_data_getter.py
# coding=utf8

import os
import pandas as pd
import pymysql.cursors
import re

class BaseDataGetter(object):
    """Helpers for reading from and writing to MySQL.

    Every method is a classmethod. The ``*_from_conf`` entry points open a
    connection from a config object exposing ``host``, ``user``, ``psd``,
    ``db`` and optionally ``port`` (default 3306). Methods that receive a
    connection close it before returning, except ``get_db_table_list``.
    """

    @classmethod
    def get_db_table_list(cls, conn):
        """Return the names of all tables in the connected database.

        Handles rows returned as tuples as well as rows returned as dicts
        (``DictCursor``); only row values are collected, not column headers.
        The connection is left open.
        """
        cursor = conn.cursor()
        try:
            cursor.execute("show tables;")
            rows = cursor.fetchall()
        finally:
            cursor.close()

        table_list = []
        for row in rows:
            values = row.values() if isinstance(row, dict) else row
            table_list.extend(str(value) for value in values)
        return table_list

    @classmethod
    def get_table_name(cls, sql):
        """Extract the lower-cased text between ``from`` and ``where``/``group by``.

        This is plain string splitting, not SQL parsing: the statement must
        contain exactly one ``from`` before the first ``where`` (or
        ``group by``). For join queries the returned text contains the whole
        join expression.

        :raises ValueError: when the statement does not have that shape.
        """
        lowered = sql.lower()
        if 'where' in lowered:
            head, error = lowered.split('where')[0], "sql error1"
        elif 'group' in lowered:
            head, error = lowered.split('group by')[0], "sql error2"
        else:
            head, error = lowered, "sql error3"

        parts = head.split('from')
        if len(parts) != 2:
            print(parts)
            # Raising a bare string is itself a TypeError in Python 3.
            raise ValueError(error)
        return parts[1].strip()

    @classmethod
    def _table_is_known(cls, table_name, table_list):
        """Return True when a query against ``table_name`` should be executed.

        Join expressions are always accepted. Otherwise the name must be in
        ``table_list``; the comparison ignores case because
        ``get_table_name`` lower-cases its result.
        """
        if 'join' in table_name:
            return True
        if table_name in {name.lower() for name in table_list}:
            return True
        print("table_name not in table_list:", table_name)
        print("table_list size:", len(table_list))
        return False

    @classmethod
    def _dump_sql_data(cls, sqls, conn):
        """Run each query and save its result to a local CSV file.

        :param sqls: ``{file_name: sql}``; each result is written to
            ``file_name``, appended without header when the file exists.
        :param conn: open database connection; always closed on return.
        """
        try:
            table_list = cls.get_db_table_list(conn)

            for file_name, sql in sqls.items():
                print(sql)
                table_name = cls.get_table_name(sql)
                if not cls._table_is_known(table_name, table_list):
                    continue

                df = pd.read_sql(sql, conn)
                if os.path.exists(file_name):
                    df.to_csv(file_name, mode="a", header=False)
                else:
                    df.to_csv(file_name)
        finally:
            conn.close()

    @classmethod
    def _init_connect(cls, host, port, uname, pwd, db):
        """Build a pymysql connection that returns rows as dicts.

        :param host: server host name.
        :param port: server port; a string is converted to int.
        :param uname: user name.
        :param pwd: password.
        :param db: database name.
        :return: the open connection.
        """
        if isinstance(port, str):
            port = int(port)
        conn = pymysql.connect(host=host, port=port, user=uname,
                               password=pwd, db=db, charset="utf8",
                               cursorclass=pymysql.cursors.DictCursor)
        return conn

    @classmethod
    def _connect_from_conf(cls, conf):
        """Open a connection from a config object (``port`` defaults to 3306)."""
        port = getattr(conf, "port", 3306)
        return cls._init_connect(host=conf.host, port=int(port),
                                 uname=conf.user, pwd=conf.psd, db=conf.db)

    @classmethod
    def download(cls, host, port, uname, pwd, db, sqls):
        """Connect with explicit credentials and dump ``sqls`` to CSV files."""
        conn = cls._init_connect(host=host, port=int(port), uname=uname, pwd=pwd, db=db)
        cls._dump_sql_data(sqls, conn)

    @classmethod
    def download_from_conf(cls, conf, sqls):
        """Connect using ``conf`` and dump ``sqls`` to CSV files."""
        cls._dump_sql_data(sqls, cls._connect_from_conf(conf))

    @classmethod
    def get_from_conf(cls, conf, sql):
        """Connect using ``conf`` and return the result of ``sql`` as a DataFrame."""
        return cls.get_sql_data(sql, cls._connect_from_conf(conf))

    @classmethod
    def get_sql_data(cls, sql, conn):
        """Run ``sql`` and return the result as a DataFrame.

        Returns ``None`` when the referenced table does not exist. The
        connection is always closed on return.
        """
        try:
            table_list = cls.get_db_table_list(conn)
            table_name = cls.get_table_name(sql)
            print(sql)
            if not cls._table_is_known(table_name, table_list):
                return
            return pd.read_sql(sql, conn)
        finally:
            conn.close()

    @classmethod
    def insert_from_conf(cls, conf, sql):
        """Connect using ``conf`` and execute the write statement ``sql``."""
        cls.insert_sql_data(sql, cls._connect_from_conf(conf))

    @classmethod
    def insert_sql_data(cls, sql, conn):
        """Execute ``sql`` and commit; the connection is always closed."""
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(sql)
                conn.commit()
                # Report success only after the commit went through.
                print('insert is ok...',)
            finally:
                cursor.close()
        finally:
            conn.close()
  • 常用2:读写redis
  • 调用:
  • data_saver   = RedisSaver(conf.redis_info)
  • redis_client = data_saver.redis
  • 写入:data_saver.zadd_list_batchly_selfExpTime([[key1, {item1:score1, item2:score2}], [key2, {xxx}]], 7*6*3600)
  • 读取:redis_client.zrange(rt_key, 0, max(0, rt_end-1), withscores=True)
  • 注意:无论啥样的数据,写入redis,都要设置过期时间,1天、7天、1个月等,切记不可永久,不可只增不减
  • 代码:redis_saver.py
# coding=utf8
from redis import StrictRedis

class RedisSaver(object):
    """Write lists, sets and sorted sets to Redis, always with an expiry.

    Every write sets a TTL so that keys never live forever. Writers without
    an explicit ``exp_time`` fall back to ``self.exp_time`` (7 days).
    Commands are queued on ``self.pipe`` and sent in one round trip; if
    anything fails, the pipeline is reset so stale commands do not leak into
    the next call.
    """

    def __init__(self, conf):
        """Connect using ``conf.redis_pure_info`` (``host``, ``port``, ``db``, ``pwd``)."""
        info = conf.redis_pure_info
        self.redis = StrictRedis(host=info.host, port=info.port,
                                 db=info.db, password=info.pwd)
        self.pipe = self.redis.pipeline()
        self.exp_time = 7 * 24 * 3600

    def _ttl(self, exp_time):
        """Return ``exp_time``, or the instance default when it is ``None``."""
        return self.exp_time if exp_time is None else exp_time

    def _queue_set(self, name, item_list, exp_time):
        """Queue "replace set ``name`` with ``item_list``" on the pipeline."""
        items = list(item_list)
        self.pipe.delete(name)
        if items:  # SADD without members is rejected by Redis
            self.pipe.sadd(name, *items)
            self.pipe.expire(name, exp_time)

    def _queue_sorted_set(self, name, item_scores, exp_time):
        """Queue "replace sorted set ``name`` with ``item_scores``" on the pipeline."""
        self.pipe.delete(name)
        if item_scores:  # ZADD needs at least one member/score pair
            self.pipe.zadd(name, item_scores)
            self.pipe.expire(name, exp_time)

    def _execute(self):
        """Send the queued commands; clear the queue if that fails."""
        try:
            self.pipe.execute()
        except Exception:
            self.pipe.reset()
            raise

    def add_sorted_set(self, name, item_score_dict, exp_time=None):
        """Replace sorted set ``name`` with ``{member: score}`` and set its TTL."""
        try:
            self._queue_sorted_set(name, item_score_dict, self._ttl(exp_time))
        except Exception:
            self.pipe.reset()
            raise
        self._execute()

    def add_list(self, name, item_list, exp_time=None):
        """Push ``item_list`` onto list ``name`` (existing items are kept) and refresh its TTL."""
        items = list(item_list)
        if not items:  # LPUSH without values is rejected by Redis
            return
        try:
            self.pipe.lpush(name, *items)
            self.pipe.expire(name, self._ttl(exp_time))
        except Exception:
            self.pipe.reset()
            raise
        self._execute()

    def sadd_list(self, name, item_list, exp_time=None):
        """Replace set ``name`` with ``item_list`` and set its TTL."""
        try:
            self._queue_set(name, item_list, self._ttl(exp_time))
        except Exception:
            self.pipe.reset()
            raise
        self._execute()

    def sadd_list_batchly(self, name_items_pairs):
        """Replace several sets at once; each gets the default TTL.

        :param name_items_pairs: iterable of ``(name, item_list)`` pairs.
        """
        try:
            for name, item_list in name_items_pairs:
                self._queue_set(name, item_list, self.exp_time)
        except Exception:
            self.pipe.reset()
            raise
        self._execute()

    def zadd_list_batchly_selfExpTime(self, name_item_score_dict, exp_time=1*6*3600):
        """Replace several sorted sets at once with a caller-chosen TTL.

        :param name_item_score_dict: iterable of ``(name, {member: score})`` pairs.
        :param exp_time: TTL in seconds applied to every key (default 6 hours).
        """
        try:
            for name, item_scores in name_item_score_dict:
                self._queue_sorted_set(name, item_scores, exp_time)
        except Exception:
            self.pipe.reset()
            raise
        self._execute()

python——pandas、numpy、scipy

python——re

python——os、sys、shell

python——emoji

  • 常用:过滤emoji,总是出现各种emoji不能被完全过滤。
  • 方法:
  • 使用正则过滤、使用emoji包解析emoji符号配合正则过滤掉
  • 代码:
import re
import emoji

# 方法1:并不能完全过滤,只能过滤部分emoji
def filterEmoji(desstr,restr=' '):
    """Replace every character outside the Basic Multilingual Plane with ``restr``.

    Emoji encoded inside the BMP are not matched by this pattern.
    """
    astral_chars = u'[\U00010000-\U0010ffff]'
    surrogate_pairs = u'[\uD800-\uDBFF][\uDC00-\uDFFF]'
    try:
        matcher = re.compile(astral_chars)
    except re.error:
        # Fallback used when the wide range cannot be compiled.
        matcher = re.compile(surrogate_pairs)
    return matcher.sub(restr, desstr)

# 方法2:使用emoji包,将🤓解析为“:nerd_face:”,再使用正则过滤掉。
#(需要注意文本中是否包含了":xxx:",xxx也会被过滤掉)
def filterEmojiByEmojiAndRe(s, restr=" "):
    """Replace emoji in ``s`` with ``restr`` and return the result.

    ``emoji.demojize`` first rewrites each emoji as a ``:name:`` token; the
    regex then replaces every ``:...:`` span. Ordinary text that happens to
    sit between two colons is replaced as well.
    """
    # The substitution result must be returned; without the return the
    # function always yields None.
    return re.sub('(:.+?:)', restr, emoji.demojize(s))

'''
🤓符号:
print(text.encode('unicode-escape').decode('ASCII')) 
output: \U0001f188\ue513\ue220\ue21c
print('\ud83e\udd13') #出错
'''
# Demo input: a string that starts with an emoji character.
s = '🤓无限大地'
# Print what the regex-only filter returns for the demo string.
print(filterEmoji(s))

# Print what the emoji-package based filter returns for the same string.
print(filterEmojiByEmojiAndRe(s))

python——gensim、nltk、spacy

  • nltk常用:对英文进行分句(平时做机器翻译时,需要对段落处理,拆分为若干句子)
  • nltk数据包:
  • nltk.download()太慢。
  • 百度云盘直接下载:https://pan.baidu.com/s/17ZgkoQeMosWwHNlUvXvTdw 密码:lxmh
  • 解压放到/Users/shifeng/nltk_data。即可
  • 代码:
# Split an English paragraph into sentences with nltk.
import nltk
# Load the English Punkt sentence tokenizer from the local nltk_data folder.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# The literal must be closed with a plain ASCII quote; a typographic quote
# here is a syntax error.
tokenizer.tokenize('fight among communists and anarchists (i.e. at a series of events named May Mr. Days).')

# Split a sentence into words.
import nltk
nltk.word_tokenize("And now for something completely different.")

python——pytorch、tensorflow

python——flask、apscheduler

  • 常用1:需要将任务打包成服务,使用flask封装很方便,暴露一个接口即可。
  • 常用2:对任务进行定时调度。在flask框架中,每日固定某个点更新模型。
  • 策略:分两种BackgroundScheduler、BlockingScheduler
  • 前者不阻塞,在主程序(如flask服务)中,额外再开一个定时任务,定时任务结束,不影响主程序。
  • 后者阻塞,需要为主程序调用,定时任务结束,主程序结束。
  • 参考:https://zhuanlan.zhihu.com/p/74046287
  • flask代码:
  • request里的POST、GET请求问题
  • client调用的问题
  • apscheduler代码:
# flask里面进行定时任务例子:
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.schedulers.blocking import BlockingScheduler

from flask import Flask, jsonify, request
# Create the Flask application; the print below runs at import time.
app=Flask(__name__)
print('start app is ok...')

@app.route('/')
def hello_world():
    """Handle requests to ``/`` with a fixed text response."""
    return "hello word"

def flashModelAndData():
    """Placeholder job body; registered below as the daily cron job."""
    print('xxx')

def saveUserDateHadRec():
    """Placeholder job body; registered below as the 30-minute interval job."""
    print('UUU')

# BackgroundScheduler.start() returns immediately, so the module keeps
# executing after it and Flask can serve requests.
sched = BackgroundScheduler(daemon=True)
# Cron trigger: every day at 02:01.
sched.add_job(flashModelAndData, 'cron', hour=2, minute=1)
# Interval trigger: every 30 minutes.
sched.add_job(saveUserDateHadRec, 'interval', minutes=30)
sched.start()

if __name__ == '__main__':
    # NOTE(review): `config` is not defined or imported in this snippet —
    # presumably a project settings module providing HOST and PORT; confirm.
    app.run(config.HOST, config.PORT)

# 阻塞定时任务调用例子:
from datetime import datetime
import os
from apscheduler.schedulers.blocking import BlockingScheduler
 
def tick():
    """Scheduled job: print the current local time."""
    print('Tick! The time is: %s' % datetime.now())


if __name__ == '__main__':
    scheduler = BlockingScheduler()
    # Cron trigger: run tick() every day at 09:01.
    scheduler.add_job(tick, 'cron', hour=9, minute=1)
    try:
        # start() blocks the calling thread until the scheduler is shut down.
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        # Ctrl+C or a normal exit request ends the script without a traceback.
        pass

python——warnings(等技巧相关)

  • 代码:
# 代码里面
import warnings
warnings.filterwarnings("ignore”)

# ipython -W ignore yourscript.py #启动ipython时