python seek thread 超大日志数据分析

葫芦的运维日志

下一篇 搜索 上一篇

2018/09/21 10:52


 

#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''
./flowdata.log
2017-02-02 15:29:19,390 [views:111:ebitpost] [INFO]- ebitapi: http://218.85.118.8:8000/api/user/query, ebit response: src_ip: 110.86.101.119:63688, content: {"data":{"basic_rate_down":20480,"basic_rate_up":2048,"dial_acct":"fj::059391534153","max_linerate_down":102400,"max_linerate_up":102400},"message":"提速判断成功","result":0}
'''
'''
./ipdb_cn.txt
1.1.1.0  中国 广东 深圳 
1.1.2.0  中国 广东 深圳
...
233.233.2.0  中国 新疆 乌鲁木齐
'''
import re,heapq,threading
from collections import Counter
from multiprocessing import Pool
# Maps a region name (the third whitespace-separated column of the IP
# database) to the set of "a.b.c" network prefixes listed for that region.
dic = {}


def readconfig(path='./ipdb_cn.txt'):
    """Load the IP database at *path* into the module-level ``dic``.

    Each useful line looks like ``1.1.1.0  <col1> <col2> <col3>``: the first
    column is a dotted address and the third column (index 2) is used as the
    grouping key.  The address is stored without its last octet so that it
    can be matched against the first three octets of a client IP.

    Lines with fewer than three columns (blank lines, ``...`` placeholders)
    are skipped instead of raising ``IndexError``.

    Args:
        path: Location of the database file; defaults to the path the script
            has always used, so ``Thread(target=readconfig)`` keeps working.

    Returns:
        None.  ``dic`` is updated in place.
    """
    # io.open decodes UTF-8 identically on Python 2 and Python 3, which
    # replaces the Python 2-only ``str.decode`` call.
    import io

    with io.open(path, mode='r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if len(fields) < 3:
                continue
            prefix = '.'.join(fields[0].split('.')[:-1])
            # setdefault replaces the has_key()/else branches.
            dic.setdefault(fields[2], set()).add(prefix)
# Load the IP database in a background thread while the log is scanned.
t = threading.Thread(target=readconfig)
t.start()
# Measure the log size in bytes by seeking to the end.  The handle is only
# needed for this measurement, so it is closed as soon as the offset is read;
# binary mode makes tell() a plain byte offset.
with open('./flowdata.log', 'rb') as tf:
    tf.seek(0, 2)
    total = tf.tell()
def run(start, end, path='./flowdata.log'):
    """Collect the distinct IPv4 addresses that follow ``_ip:`` in a byte range.

    Reading begins at byte offset *start* (which may fall in the middle of a
    line; the partial line is still scanned) and continues line by line until
    the offset after a line exceeds *end*.  Only the first ``_ip:`` match of
    each line is recorded.

    Args:
        start: Byte offset at which to begin reading; coerced to ``int``.
        end: Byte offset after which reading stops once the current line has
            been consumed.
        path: Log file to scan; defaults to the path the script has always
            used, so ``apply_async(run, args=(start, end))`` keeps working.

    Returns:
        A set of address strings such as ``'110.86.101.119'``.
    """
    # Bytes pattern because the file is read in binary mode.
    pattern = re.compile(br'_ip:\s?([0-9]+(?:\.[0-9]+){3})')
    search = pattern.search
    found = set()
    position = int(start)
    # Binary mode makes len(line) an exact byte count, so the offset can be
    # tracked by hand.  f.tell() is unreliable while iterating a file: the
    # read-ahead buffer makes it overshoot, so a chunk could end early.
    with open(path, 'rb') as f:
        f.seek(position, 0)
        for line in f:
            match = search(line)
            if match:
                ip = match.group(1)
                # Return native strings on both Python 2 and Python 3.
                if not isinstance(ip, str):
                    ip = ip.decode('ascii')
                found.add(ip)
            position += len(line)
            if position > end:
                break
    return found
# Scan the log as 12 byte ranges handed to a pool of 4 worker processes.
p = Pool(4)
results = []
for i in range(12):
    # Floor division keeps the offsets integral even where "/" is true
    # division; file.seek() rejects floats.
    result = p.apply_async(run, args=(i * total // 12, (i + 1) * total // 12))
    results.append(result)
p.close()
p.join()
# The IP database must be fully loaded before the lookups that follow.
t.join()
# Merge the per-chunk sets; an address seen in several chunks counts once.
filset = set()
for result in results:
    filset |= result.get()
sumfil = len(filset)
filist = list(filset)
def refn(start,end):
    """Return one region key per (address, region) hit in ``filist[start:end]``.

    For every address in the slice, each key of ``dic`` is checked; the key
    is emitted when the address with its last dotted component removed is a
    member of that key's prefix set.  Results follow the slice order, then
    the iteration order of ``dic``.
    """
    matched = []
    for address in filist[start:end]:
        for region in dic:
            if address[:address.rindex('.')] in dic[region]:
                matched.append(region)
    return matched
# Resolve the unique addresses to regions as 8 slices over 4 worker processes.
# NOTE(review): the workers read the module-level ``dic`` and ``filist``;
# this presumably relies on a fork-style start method - confirm before
# running on a platform that spawns fresh interpreters.
p = Pool(4)
results = []
for i in range(8):
    # Floor division keeps the slice bounds integral even where "/" is true
    # division; list slicing rejects floats.
    result = p.apply_async(refn, args=(i * sumfil // 8, (i + 1) * sumfil // 8))
    results.append(result)
p.close()
p.join()
fn = []
for result in results:
    fn += result.get()
# Hits per region, expressed as a percentage of the number of unique addresses.
fdic = Counter(fn)
ret = [{'n': k, 'v': fdic[k] / float(sumfil) * 100} for k in fdic]
# Largest share first; same ordering as heapq.nlargest(len(ret), ...).
sortl = sorted(ret, key=lambda s: s['v'], reverse=True)
for i in sortl:
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(i['n'] + '   ' + '%.2f' % round(i['v'], 2) + '%')

 

葫芦的运维日志

上一篇 搜索 下一篇
© 冰糖葫芦甜(bthlt.com) 2019 王梓 赞助联系方式 陕ICP备17005322号