Python Concurrency in Practice 03: Concurrency in Action (Part 1)
This is a 16S data quality-control pipeline. A single sequencing lane contains many projects, each with its own contract number (here called "compact"), and a project may include both 16S and ITS data. A complete pipeline takes the upstream demultiplexed data and turns all of it into data that can be analyzed directly. Originally this work was parallelized through SGE, i.e. parallelism at the job-scheduling level; here the parallelism is implemented inside the program itself, so the pipeline can run without an SGE system.
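At its core the script is a producer–consumer setup on top of multiprocessing: the main process pushes one job per sample onto a JoinableQueue, a fixed number of daemon worker processes pull jobs off that queue, a failed job is pushed back for another attempt, and finished results are collected through an ordinary Queue. Below is a minimal, self-contained sketch of that pattern; the do_work function is only a hypothetical stand-in for the real per-sample processing (pandaseq + QC).

import sys
from multiprocessing import Process, JoinableQueue, Queue, cpu_count

def do_work(job):
    # hypothetical stand-in for the real per-sample work (pandaseq + QC)
    return job * job

def worker(jobs, results):
    while True:
        job = jobs.get()
        try:
            results.put(do_work(job))
        except Exception:
            jobs.put(job)        # a failed job goes back on the queue for a retry
        finally:
            jobs.task_done()     # always acknowledge the fetched job, or join() blocks forever

if __name__ == '__main__':
    jobs, results = JoinableQueue(), Queue()
    for job in range(20):          # producer: push all jobs up front
        jobs.put(job)
    for _ in range(cpu_count()):   # consumers: one daemon worker per CPU
        p = Process(target=worker, args=(jobs, results))
        p.daemon = True            # daemons are killed when the main process exits
        p.start()
    jobs.join()                    # returns once every job has been acknowledged
    while not results.empty():
        sys.stderr.write('result: %s\n' % results.get_nowait())

jobs.join() only returns once task_done() has been called as many times as put(), which is why task_done() sits in a finally block in the full pipeline as well: a crashed job would otherwise block the whole run.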
import os
import sys
import re
import time
import collections
import threading
from multiprocessing import Process, JoinableQueue, Queue, cpu_count
from threading import Thread
from settings import primer, pandaseq_soft
from programs import *

Result = collections.namedtuple("Result", "compact sample_name HQ_fq")

def parse_sam_barcode_file(sam_barcode_file):
    for line in open(sam_barcode_file):
        yield line.strip().split('\t')

def proc(compact, sample_name, work_path, lib_method, data_type):
    split_path = '%s/Split' % work_path
    QC_path = '%s/QC' % work_path
    compact_path = '%s/%s' % (QC_path, compact)
    if not os.path.exists(compact_path):
        os.makedirs(compact_path)
    sample_path = '%s/%s' % (compact_path, sample_name)
    if not os.path.exists(sample_path):
        os.makedirs(sample_path)
    original_path = '%s/%s/%s' % (split_path, compact, sample_name)
    (read1, read2) = os.popen('ls %s/*' % original_path).read().strip().split('\n')
    pandaseq_fq = '%s/pandaseq.fq' % sample_path
    pandaseq_log = '%s/pandaseq.log' % sample_path
    pandaseq(pandaseq_soft, read1, read2, pandaseq_fq,
             primer[lib_method][data_type]['forward'],
             primer[lib_method][data_type]['reverse'], pandaseq_log)
    high_quality_fq = '%s/high_quality.fq' % sample_path
    high_quality_log = '%s/high_quality.stat' % sample_path
    QC(pandaseq_fq, high_quality_fq, high_quality_log, data_type)
    return Result(compact, sample_name, high_quality_fq)

def worker(work_path, jobs, results):
    while True:
        try:
            compact, sample_name, lib_method, data_type = jobs.get()
            try:
                result = proc(compact, sample_name, work_path, lib_method, data_type)
                sys.stderr.write('Process %s has finished compact:%s sample_name:%s\n' % (os.getpid(), compact, sample_name))
                results.put(result)
            except Exception:
                sys.stderr.write('Process %s FAILED !!! %s/%s may have some problem!\n' % (os.getpid(), compact, sample_name))
                jobs.put((compact, sample_name, lib_method, data_type))
                sys.stderr.write('The job is re-pushed into the queue, with compact:%s sample_name:%s\n' % (compact, sample_name))
        finally:
            jobs.task_done()

def add_jobs(work_path, sam_barcode_file_list, jobs):
    job_num = 0
    data_type_hash = {}
    for sam_barcode_file in sam_barcode_file_list:
        sam_barcode_file = sam_barcode_file.strip()
        if not os.path.isfile(sam_barcode_file):
            continue
        lib_method = get_lib_method(sam_barcode_file)
        if lib_method is None:
            continue
        print 'sam_barcode_file loading: %s ...... ok\n' % sam_barcode_file
        for compact, sample_name, barcode_info, data_type in parse_sam_barcode_file(sam_barcode_file):
            if not data_type_hash.has_key(compact):
                data_type_hash[compact] = {}
            if not data_type_hash[compact].has_key(data_type):
                data_type_hash[compact][data_type] = []
            data_type_hash[compact][data_type].append(sample_name)
            jobs.put((compact, sample_name, lib_method, data_type))
            job_num += 1
            sys.stderr.write('The job is pushed into the queue, with compact:%s sample_name:%s\n' % (compact, sample_name))
    sys.stderr.write('\n### All %s jobs have been pushed into the queue ###\n' % job_num)
    return data_type_hash

def create_processes(concurrency, jobs, work_path, results):
    print '\nBegin creating %s worker Processes...\n' % concurrency
    for _ in range(concurrency):
        process = Process(target=worker, args=(work_path, jobs, results))
        process.daemon = True
        process.start()

def main(work_path, sam_barcode_file_list):
    global concurrency
    split_path = '%s/Split' % work_path
    QC_path = '%s/QC' % work_path
    jobs = JoinableQueue()
    results = Queue()
    canceled = False
    data_type_hash = add_jobs(split_path, sam_barcode_file_list, jobs)
    create_processes(concurrency, jobs, work_path, results)
    try:
        jobs.join()
    except KeyboardInterrupt:
        sys.stderr.write('cancelling ...\n')
        canceled = True
    finally:
        job_num = 0
        finished_hash = {}
        while not results.empty():
            result = results.get_nowait()
            job_num += 1
            if not finished_hash.has_key(result.compact):
                finished_hash[result.compact] = []
            finished_hash[result.compact].append(result.sample_name)
        sys.stderr.write('all %s work finished!\n\n' % job_num)
        log_out = open('%s/work.log' % QC_path, 'w')
        for compact, sample_list in finished_hash.iteritems():
            for sample_name in sample_list:
                log_out.write('%s\t%s has been finished\n' % (compact, sample_name))
        log_out.close()
    if canceled:
        return False
    for compact in os.listdir(QC_path):
        compact_dir = '%s/%s' % (QC_path, compact)
        if not os.path.isdir(compact_dir):
            continue
        sys.stderr.write('Begin stat compact: %s\n' % compact)
        reads_stat(compact_dir)
    sys.stderr.write('All compact stat finished!\n\n')
    reads_stat_all(QC_path, split_path)
    merge_threads = set()
    for compact, subitem in data_type_hash.iteritems():
        compact_dir = '%s/%s' % (QC_path, compact)
        for data_type, sample_list in subitem.iteritems():
            merged_file = '%s/%s/%s.together.fna' % (QC_path, compact, data_type)
            t = Thread(target=sampleMerge, args=(sample_list, data_type, compact_dir, merged_file))
            merge_threads.add(t)
            t.start()
            while True:
                if threading.activeCount() < concurrency:
                    break
    for t in threading.enumerate():
        if t in merge_threads:
            t.join()
    sys.stderr.write('\nAll pipeline is done!\n')

if __name__ == '__main__':
    sys.argv.pop(0)
    if len(sys.argv) < 1:
        sys.stderr.write('Usage: python run_pipeline.py work_path [process_num]\n    process_num defaults to cpu_count\n')
        sys.exit()
    work_path = sys.argv.pop(0)
    work_path = os.path.abspath(work_path)
    sys.stderr.write('Workdir is %s, pipeline begins\n' % work_path)
    sam_barcode_file_list = os.popen('ls %s/Split/sam_barcode.*' % work_path).read().strip().split('\n')
    if len(sys.argv) != 0:
        concurrency = int(sys.argv.pop(0))
    else:
        concurrency = cpu_count()
    main(work_path, sam_barcode_file_list)
Below are the helper routines (the programs module imported above):
from __future__ import division
from threading import Thread, Lock
from multiprocessing import cpu_count
import threading
import sys
import os
import re
import types
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

def fq_reads_num(fq_file):
    wc_out = os.popen('wc -l %s' % fq_file).read().strip()
    result = int(re.search('^(\d+)', wc_out).group(1)) / 4
    return int(result)

def Q_ave(self):
    Q_sum = 0
    for qlist in self.letter_annotations.itervalues():
        for q in qlist:
            Q_sum += q
    Q_ave = Q_sum / len(self)
    return Q_ave

def QC(file, out_file, out_stat_file, data_type):
    SeqRecord.Q_ave = Q_ave
    out_stat = open(out_stat_file, 'w')
    out = open(out_file, 'w')
    count = 0
    high_count = 0
    for record in SeqIO.parse(open(file), 'fastq'):
        count += 1
        if record.Q_ave() < 20:
            continue
        if len(record) < 220 or len(record) > 500:
            continue
        out.write(record.format('fastq'))
        high_count += 1
    high_ratio = high_count / count if count else 0
    out_stat.write('%s\t%s\t%s\t%s\n' % (data_type, count, high_count, high_ratio))
    out.close()
    out_stat.close()

class MyList(list):
    def __str__(self):
        out_str = ''
        for item in self:
            out_str += item
            out_str += '\t'
        return out_str.strip()

def parse_stat(stat_file):
    tabs = os.popen('cat %s' % stat_file).read().strip().split('\t')
    yield tabs

def parse_stat_files(compact_path):
    for f in os.popen('ls %s/*/*.stat' % compact_path):
        stat_file = f.strip()
        sample_name = re.search('%s\/(\S+)\/high_quality\.stat' % compact_path, stat_file).group(1)
        yield stat_file, sample_name

def reads_stat(compact_path):
    out = open('%s/reads_stat.xls' % compact_path, 'w')
    sample_reads = {}
    for stat_file, sample_name in parse_stat_files(compact_path):
        for tabs in parse_stat(stat_file):
            sample_reads[sample_name] = tabs
    out.write('sample_name\tsample_type\traw_reads\tHQ_reads\tHQ_ratio\n')
    for sample, tabs in sample_reads.iteritems():
        tabs = MyList(tabs)
        out.write('%s\t%s\n' % (sample, str(tabs)))
    out.close()

def raw_stat_thread(fq_file, lock, compact, sample_name, tabs, out):
    global total_reads
    # sys.stderr.write('thread %s stat with %s %s\n' % (threading.currentThread().ident, compact, sample_name))
    raw_reads = fq_reads_num(fq_file)
    lock.acquire()
    total_reads += raw_reads
    data_type = tabs.pop(0)
    ratio = int(tabs[1]) / raw_reads * 100
    tabs = str(MyList(tabs))
    out.write('%s\t%s\t%s\t%s\t%s\t%2.2f%%\n' % (compact, sample_name, data_type, raw_reads, tabs, ratio))
    lock.release()
    # sys.stderr.write('thread %s finished doing with %s %s\n' % (threading.currentThread().ident, compact, sample_name))

total_reads = 0

def reads_stat_all(work_path, original_path):
    global total_reads
    sys.stderr.write('\nmerge stat is beginning ...\n')
    out = open('%s/reads_stat.xls' % work_path, 'w')
    compact_hash = {}
    for f in os.listdir(work_path):
        compact = f.strip()
        compact_path = '%s/%s' % (work_path, compact)
        if not os.path.isdir(compact_path):
            continue
        if not compact_hash.has_key(compact):
            compact_hash[compact] = {}
        for stat_file, sample_name in parse_stat_files(compact_path):
            for tabs in parse_stat(stat_file):
                compact_hash[compact][sample_name] = tabs
    out.write('compact\tsample_name\tdata_type\traw_reads\tpandaseq_reads\tHQ_reads\tratio\n')
    lock = Lock()
    active_threads = set()
    for compact, sample in compact_hash.iteritems():
        sys.stderr.write('doing %s stat\n' % compact)
        for sample_name, tabs in sample.iteritems():
            original_fq = os.popen('ls %s/%s/%s/*' % (original_path, compact, sample_name)).read().strip().split('\n').pop(0)
            t = Thread(target=raw_stat_thread, args=(original_fq, lock, compact, sample_name, tabs, out))
            active_threads.add(t)
            t.start()
            while True:
                if threading.activeCount() < cpu_count():
                    break
    out.flush()
    for t in threading.enumerate():
        if t in active_threads:
            sys.stderr.write('thread %s is still alive, wait ...\n' % t.ident)
            t.join()
    sys.stderr.write('Stating unaligned reads ...\n')
    out.write('\n###\n')
    unalign_fq = os.popen('ls %s/Unalign/*' % original_path).read().strip().split('\n').pop(0)
    unalign_reads = fq_reads_num(unalign_fq)
    total_reads += unalign_reads
    ratio = unalign_reads / total_reads * 100
    out.write('Unalign\t%s\t%2.2f%%\n' % (unalign_reads, ratio))
    out.close()
    sys.stderr.write('merge stat is all finished!\n\n')

def pandaseq(pandaseq_soft, read1, read2, fa_out, f_primer, r_primer, log_out):
    cmd = '%s -F -f %s -r %s -w %s -p %s -q %s -g %s -l 220 -L 500' % (pandaseq_soft, read1, read2, fa_out, f_primer, r_primer, log_out)
    os.system(cmd)

def sampleMerge(sample_list, data_type, file_path, outfile):
    outhandle = open(outfile, 'w')
    # sys.stderr.write('Begin merge into %s\n' % file_path)
    reads_num = {}
    f_template = '%s/%s/high_quality.fq'
    for sample in sample_list:
        f = f_template % (file_path, sample)
        sample = re.sub('[-_]', '.', sample)
        sample = '%s%s' % (data_type, sample)
        if not reads_num.has_key(sample):
            reads_num[sample] = 0
        for record in SeqIO.parse(open(f), 'fastq'):
            reads_num[sample] += 1
            outhandle.write('>%s_%s\n%s\n' % (sample, reads_num[sample], str(record.seq)))
    outhandle.close()
    sys.stderr.write('merge file: %s is finished\n' % outfile)

def get_lib_method(file):
    file = os.path.basename(file)
    if re.match('^sam_barcode.l$', file):
        lib_method = 'Self'
    elif re.match('^sam_barcode.s\d+$', file):
        lib_method = 'HXT'
    else:
        lib_method = None
    return lib_method
settings.py contains the primer sequences for the different library-construction methods.
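The pipeline only imports two names from it: pandaseq_soft, the path to the pandaseq executable, and primer, a nested dict looked up as primer[lib_method][data_type]['forward'/'reverse']. A minimal sketch of what the file might look like follows; the path, the data-type keys ('16S', 'ITS') and the primer sequences are placeholders, not the real values.

# settings.py -- a sketch only; path, data-type keys and primer sequences are placeholders
pandaseq_soft = '/opt/bio/bin/pandaseq'   # hypothetical path to the pandaseq binary

# looked up as primer[lib_method][data_type]['forward' / 'reverse'];
# lib_method is 'Self' or 'HXT' (see get_lib_method), data_type comes from the sam_barcode files
primer = {
    'Self': {
        '16S': {'forward': 'NNNNNNNNNNNNNNNNNNN', 'reverse': 'NNNNNNNNNNNNNNNNN'},
        'ITS': {'forward': 'NNNNNNNNNNNNNNNNNNN', 'reverse': 'NNNNNNNNNNNNNNNNN'},
    },
    'HXT': {
        '16S': {'forward': 'NNNNNNNNNNNNNNNNNNN', 'reverse': 'NNNNNNNNNNNNNNNNN'},
        'ITS': {'forward': 'NNNNNNNNNNNNNNNNNNN', 'reverse': 'NNNNNNNNNNNNNNNNN'},
    },
}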
This program also serves as a showcase of what I have learned over the past few days.