HOME/Articles/

mysql example basespider scrapy js api utils date parse (snippet)

Article Outline

Python mysql example 'basespider scrapy js api utils date parse'

Functions in program:

  • def date_unit(unit):
  • def date_scale(dt, scale='MM'):
  • def _parse(x, now=None):
  • def tz_offset(tz):
  • def parse_date(x, fmt='auto', tz='+08:00', err=None):

Modules used in program:

  • from datetime import datetime, timedelta
  • from collections import OrderedDict
  • import re

python basespider scrapy js api utils date parse

Python mysql example: basespider scrapy js api utils date parse

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from datetime import datetime, timedelta
from collections import OrderedDict
import re

__all__ = ['parse_date', 'tz_offset']


def parse_date(x, fmt='auto', tz='+08:00', err=None):
    """
    Parse the datetime text `x` according to `fmt`, read in timezone `tz`.

    :param x: datetime text (or integer seconds when `fmt` is
        'epoch'/'unix')
    :type x: str
    :param fmt: 'auto' to let `_parse` guess the layout, 'epoch' or 'unix'
        for seconds since the epoch, otherwise a `strptime` format string
    :type fmt: str
    :param tz: timezone label handed to `tz_offset`
        (e.g. '+00:00', 'cst', 'utc')
    :type tz: str
    :param err: when truthy, parsing errors propagate to the caller; when
        falsy they are swallowed and the epoch origin is returned instead
    :return: the parsed naive datetime shifted by ``offset - 8 hours``
        (i.e. unchanged for the default '+08:00'), or
        ``datetime.utcfromtimestamp(0)`` on failure when `err` is falsy
    :rtype: datetime

    NOTE(review): a true UTC result would be ``date - offset``; the code
    adds ``offset - 8h`` instead. Confirm which behaviour callers rely on
    before changing it.
    """
    try:
        # Python 2: normalise both inputs to unicode text
        x = unicode(x)
        fmt = unicode(fmt)

        offset = tz_offset(tz)
        # current wall-clock time in the requested timezone, used by _parse
        # as the reference point
        now = datetime.utcnow() + offset

        if fmt == 'auto':
            date = _parse(x, now)
        elif fmt in ('epoch', 'unix'):
            date = datetime.utcfromtimestamp(int(x))
            # epoch values carry no zone offset of their own
            offset = timedelta(0)
        else:
            date = datetime.strptime(x.encode('utf-8'), fmt.encode('utf-8'))

        return date + (offset - timedelta(hours=8))

    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are never swallowed by the best-effort fallback
        if err:
            raise
        return datetime.utcfromtimestamp(0)


# Convert a timezone label into its offset from UTC
def tz_offset(tz):
    """
    Translate a timezone label into a `timedelta` offset.

    :param tz: 'cst' (treated as +8 hours), 'utc', or text containing a
        signed offset such as '+08:00' or '-0530'; case and surrounding
        whitespace are ignored
    :type tz: str
    :return: the offset, negative when the sign is '-'
    :rtype: timedelta
    :raises ValueError: if `tz` matches none of the supported forms
    """
    tz = tz.lower().strip()
    if tz == 'cst':
        return timedelta(hours=8)
    if tz == 'utc':
        return timedelta()

    m = re.search(r'(?P<F>[-+])(?P<HH>\d{2}):?(?P<MM>\d{2})', tz)
    if m is None:
        # without this guard the failure surfaced as an opaque
        # AttributeError on None
        raise ValueError('unsupported timezone: %r' % (tz,))

    magnitude = timedelta(hours=int(m.group('HH')), minutes=int(m.group('MM')))
    return magnitude if m.group('F') == '+' else -magnitude


def _parse(x, now=None):
    # 当前时间
    # now = now or datetime.utcnow()
    # 秒
    now_SS = date_scale(now, 'SS')
    # 分
    now_MM = date_scale(now, 'MM')
    # 小时
    now_HH = date_scale(now, 'HH')
    # 天
    now_dd = date_scale(now, 'dd')
    # 月
    now_mm = date_scale(now, 'mm')
    # 年
    now_YY = date_scale(now, 'YY')
    # 预处理
    x = re.sub(u'刚刚|刚才', now_MM.strftime('%Y-%m-%d %H:%M:%S'), x)
    # x = re.sub(u'刚刚|刚才', now_MM.strftime('%F %T'), x)
    x = re.sub(u'几', u'0', x)
    x = re.sub(ur'(?<=[\d半前昨今明后])(天|号)', u'日', x)
    # 获取一天时间
    one_dd = date_unit('dd')
    rdays = {
        u'前日': now_dd - one_dd * 2,
        u'昨日': now_dd - one_dd * 1,
        u'今日': now_dd,
        u'明日': now_dd + one_dd * 1,
        u'后日': now_dd + one_dd * 2,
    }
    # 将x值转换成rdays对应时间格式
    for k, v in rdays.iteritems():
        x = x.replace(k, v.strftime(' %Y-%m-%d '))
        # x = x.replace(k, v.strftime(' %F '))

    x = re.sub(ur'(?<=\d)[/.](?=\d)', u'-', x)
    x = re.sub(ur'[^-:\s\d前后半秒分时日周月年]', u'', x)
    x = re.sub(ur'(?<=\d)\s+(?!\d)', u'', x)
    x = re.sub(ur'(?<!\d)\s+(?=\d)', u'', x)
    x = re.sub(ur'(?<!\d)\s+(?!\d)', u'', x)
    x = re.sub(ur'(?<!年)(?=(^(1[0-2]|\d+))月(\d+)日)', u' %d年' % now.year, x)
    x = re.sub(ur'(\d+)年(\d+)月(\d+)日', ur'\g<1>-\g<2>-\g<3> ', x)
    x = x.strip()

    if '-' in x or ':' in x:

        parts = {}
        pats = [
            ur'(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})',
            ur'(?P<hour>\d{1,2}):(?P<minute>\d{1,2})(:(?P<second>\d{1,2}))?',
        ]
        for p in pats:
            m = re.search(p, x)
            if m:
                parts.update(m.groupdict())

        for k, v in parts.items():
            if v == None:
                del parts[k]
            else:
                parts[k] = int(v)

        if parts:
            parts['year'] = parts.get('year', now.year)
            parts['month'] = parts.get('month', now.month)
            parts['day'] = parts.get('day', now.day)

            return datetime(**parts)

    if u'半' in x:

        halves = {
            u'半分': u'30秒',
            u'半时': u'30分',
            u'半日': u'12时',
            u'半周': u'84时',
            u'半月': u'15日',
            u'半年': u'6月',
        }

        for k, v in halves.iteritems():
            x = re.sub(k, v, x)

    us = {
        u'年': 'YY',
        u'月': 'mm',
        u'周': 'ww',
        u'日': 'dd',
        u'时': 'HH',
        u'分': 'MM',
        u'秒': 'SS',
    }
    m = re.search(ur'(?P<num>\d+)(?P<unit>%s)(?P<flag>前|后)' % (u'|'.join(us.keys())), x)
    if m:
        d = m.groupdict()
        k = d['unit']
        f = -1 if d['flag'] == u'前' else 1
        v = f * int(d['num'])
        u = date_unit(us[k])
        s = 'dd' if us[k] == 'ww' else us[k]
        date = date_scale(now + u * v, s)
        return date
    for i in re.findall(ur'(?<!\d)(\d{8}|\d{10}|\d{13})(?!\d)', x):
        k = len(i)
        v = int(i)
        if k == 8:
            date = datetime.strptime(i, '%Y%m%d')
        elif k == 10:
            date = datetime.fromtimestamp(v)
        elif k == 13:
            date = datetime.fromtimestamp(v / 1000)
        else:
            raise Exception()

        return date

    raise Exception()


def date_scale(dt, scale='MM'):
    """
    Truncate `dt` to the precision named by `scale`.

    Every field finer than `scale` is reset to its minimum value
    (0 for time fields, 1 for day and month).

    :param dt: datetime to truncate
    :param scale: one of 'MS', 'SS', 'MM', 'HH', 'dd', 'mm', 'YY'
        (microsecond, second, minute, hour, day, month, year)
    :return: the truncated datetime (`dt` itself for 'MS')
    :raises ValueError: if `scale` is not one of the known codes
    """
    # ordered from the finest field to the coarsest
    fields = (
        ('MS', 'microsecond'),
        ('SS', 'second'),
        ('MM', 'minute'),
        ('HH', 'hour'),
        ('dd', 'day'),
        ('mm', 'month'),
        ('YY', 'year'),
    )

    reset = {}
    for code, attr in fields:
        if code == scale:
            return dt.replace(**reset) if reset else dt
        # day and month are 1-based, every other field starts at 0
        reset[attr] = 1 if code in ('dd', 'mm') else 0

    # an explicit raise instead of `assert`, which is stripped under -O
    raise ValueError('unknown scale: %r' % (scale,))


# Fixed-length spans for each unit code; months and years are
# approximated as 30 and 365 days.
_units = {
    'SS': timedelta(seconds=1),
    'MM': timedelta(minutes=1),
    'HH': timedelta(hours=1),
    'dd': timedelta(days=1),
    'ww': timedelta(days=7),
    'mm': timedelta(days=30),
    'YY': timedelta(days=365),
}


def date_unit(unit):
    """
    Look up the `timedelta` that represents one `unit`.

    :param unit: one of 'SS', 'MM', 'HH', 'dd', 'ww', 'mm', 'YY'
    :return: the matching timedelta
    :raises KeyError: if `unit` is not a known code
    """
    span = _units[unit]
    return span


if __name__ == '__main__':
    # Sample inputs for the 'auto' format, grouped by shape.
    xs = [
        # absolute dates with '-', '/' and '.' separators
        u'2014-01-01', u'2014/11/01', u'2014.12.01',
        # clock times, with and without seconds or padding
        u'01:23', u'01:23:45', u'01 : 23 : 45',
        # date plus time
        u'2014-01-01 01:23', u'2014-01-01 01:23:45',
        # named days: today, yesterday, the day before yesterday
        u'今天', u'昨天', u'前天',
        # "just now" and a few seconds ago
        u'刚刚', u'刚才', u'几秒前', u'5秒前',
        # N units ago
        u'5分钟前', u'5小时前', u'5天前', u'5周前', u'5年前',
        # N units from now
        u'5分钟后', u'5小时后', u'5天后', u'5周后', u'5年后',
        # half a unit ago
        u'半分钟前', u'半小时前', u'半天前', u'半周前', u'半月前', u'半年前',
        # compact digit dates
        u'20140101', u'20140101 012345',
        # epoch seconds and milliseconds
        u'1400641135', u'1400641135000',
        # dates embedded in surrounding text
        u'4月19号的预售,今天都5月21号了', u'刚才 你去哪了?', u'2014 年 1 月 1 日',
    ]

    # To exercise every sample, loop over `xs` and call
    # parse_date(x, 'auto', 'cst', True) on each entry; an explicit-format
    # call looks like parse_date('01012014080000', '%m%d%Y%H%M%S', '+08:00').
    print('>>>', parse_date('2014-01-01 01:23', tz='cst'))