1、测试数据下载:ftp://ftp.ensemblgenomes.org/pub/plants/release-44/gff3/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.44.chromosome.1.gff3.gz
2、解压并重命名测试数据
[root@PC1 test2]# ls
Arabidopsis_thaliana.TAIR10.44.chromosome.1.gff3.gz
[root@PC1 test2]# gunzip Arabidopsis_thaliana.TAIR10.44.chromosome.1.gff3.gz
[root@PC1 test2]# ls
Arabidopsis_thaliana.TAIR10.44.chromosome.1.gff3
[root@PC1 test2]# mv Arabidopsis_thaliana.TAIR10.44.chromosome.1.gff3 a.txt ## 测试数据
[root@PC1 test2]# ls
a.txt
3、编写并运行提取脚本(提取 1 号染色体上起始位置大于 100000、终止位置小于 500000 的基因)
[root@PC1 test2]# ls
a.txt test.py
[root@PC1 test2]# cat test.py ## 提取信息脚本
# Extract gene records that fall inside a coordinate window from a GFF3 file.
#
# Each output line is tab-separated: seqid, start, end, and the value of the
# first attribute in column 9 (e.g. "gene:AT1G01250").


def extract_genes(in_path="a.txt", out_path="result.txt", chrom="1",
                  min_start=100000, max_end=500000):
    """Write the genes on *chrom* lying strictly inside the window.

    Args:
        in_path: Path of the tab-separated GFF3 input file.
        out_path: Path of the result file (overwritten if it exists).
        chrom: Sequence id (column 1) to keep, compared as text so that
            non-numeric ids such as "Mt" do not raise.
        min_start: A record is kept only when its start is greater than this.
        max_end: A record is kept only when its end is less than this.

    Returns:
        The number of records written to *out_path*.
    """
    written = 0
    # "with" closes both files even if an error interrupts the loop.
    with open(in_path, "r", encoding="utf-8") as in_file, \
            open(out_path, "w", encoding="utf-8") as out_file:
        for line in in_file:
            line = line.strip()
            # Skip blank lines as well as "#" comments / directives.
            if not line or line.startswith("#"):
                continue
            fields = line.split("\t")
            # A feature line needs all nine GFF3 columns; ignore truncated ones.
            if len(fields) < 9:
                continue
            if fields[0] != chrom or fields[2] != "gene":
                continue
            try:
                start = int(fields[3])
                end = int(fields[4])
            except ValueError:
                # Non-numeric coordinates: malformed record, skip it.
                continue
            if start > min_start and end < max_end:
                # Value of the first "key=value" attribute; empty when the
                # first attribute carries no "=".
                gene = fields[8].split(";")[0].partition("=")[2]
                out_file.write(
                    "\t".join((fields[0], fields[3], fields[4], gene)) + "\n"
                )
                written += 1
    return written


if __name__ == "__main__":
    extract_genes()
[root@PC1 test2]# python test.py ## 运行程序
[root@PC1 test2]# ls
a.txt result.txt test.py
[root@PC1 test2]# head result.txt ## 查看结果
1 104440 105330 gene:AT1G01250
1 108946 111699 gene:AT1G01260
1 112263 113947 gene:AT1G01280
1 114202 116407 gene:AT1G01290
1 116784 118845 gene:AT1G01300
1 119381 119997 gene:AT1G01305
1 120154 121130 gene:AT1G01310
1 121067 130577 gene:AT1G01320
1 130736 130858 gene:AT1G01335
1 132270 135924 gene:AT1G01340