[med-svn] [rsem] branch master updated (af9d30e -> 144ed09)

Michael Crusoe

2015-02-11 23:32:25 UTC

This is an automated email from the git hooks/post-receive script.

misterc-guest pushed a commit to branch master
in repository rsem.

commit 86047e0632df389d25d8bbe3705a0a5a3e7ad254
Author: Michael R. Crusoe <***@msu.edu>
Date: Wed Feb 11 15:54:01 2015 -0500

remove bundled samtools
---
debian/control | 2 +-
debian/copyright | 9 +-
debian/patches/fix-sam-includes | 124 ++
debian/patches/ignore-sam | 101 +
debian/patches/series | 2 +
debian/rules | 4 +-
debian/watch | 33 +-
sam/.DS_Store | Bin 21508 -> 0 bytes
sam/.gitignore | 4 -
sam/AUTHORS | 20 -
sam/COPYING | 21 -
sam/ChangeLog.old | 3875 ---------------------------------------
sam/INSTALL | 30 -
sam/Makefile.mingw | 63 -
sam/NEWS | 836 ---------
sam/bam.c | 474 -----
sam/bam.h | 793 --------
sam/bam2bcf.c | 467 -----
sam/bam2bcf.h | 67 -
sam/bam2bcf_indel.c | 498 -----
sam/bam2depth.c | 143 --
sam/bam_aux.c | 217 ---
sam/bam_cat.c | 185 --
sam/bam_color.c | 145 --
sam/bam_endian.h | 42 -
sam/bam_import.c | 489 -----
sam/bam_index.c | 726 --------
sam/bam_lpileup.c | 198 --
sam/bam_mate.c | 128 --
sam/bam_md.c | 389 ----
sam/bam_pileup.c | 437 -----
sam/bam_plcmd.c | 606 ------
sam/bam_reheader.c | 62 -
sam/bam_rmdup.c | 206 ---
sam/bam_rmdupse.c | 159 --
sam/bam_sort.c | 571 ------
sam/bam_stat.c | 77 -
sam/bam_tview.c | 368 ----
sam/bam_tview.h | 75 -
sam/bam_tview_curses.c | 297 ---
sam/bam_tview_html.c | 349 ----
sam/bamshuf.c | 141 --
sam/bamtk.c | 119 --
sam/bcftools/._Makefile | Bin 222 -> 0 bytes
sam/bcftools/._README | Bin 222 -> 0 bytes
sam/bcftools/._bcf.c | Bin 222 -> 0 bytes
sam/bcftools/._bcf.h | Bin 222 -> 0 bytes
sam/bcftools/._bcf.tex | Bin 222 -> 0 bytes
sam/bcftools/._bcf2qcall.c | Bin 222 -> 0 bytes
sam/bcftools/._bcfutils.c | Bin 222 -> 0 bytes
sam/bcftools/._call1.c | Bin 222 -> 0 bytes
sam/bcftools/._em.c | Bin 222 -> 0 bytes
sam/bcftools/._fet.c | Bin 222 -> 0 bytes
sam/bcftools/._index.c | Bin 222 -> 0 bytes
sam/bcftools/._kfunc.c | Bin 222 -> 0 bytes
sam/bcftools/._kmin.c | Bin 222 -> 0 bytes
sam/bcftools/._kmin.h | Bin 222 -> 0 bytes
sam/bcftools/._main.c | Bin 222 -> 0 bytes
sam/bcftools/._mut.c | Bin 222 -> 0 bytes
sam/bcftools/._prob1.c | Bin 222 -> 0 bytes
sam/bcftools/._prob1.h | Bin 222 -> 0 bytes
sam/bcftools/._vcf.c | Bin 222 -> 0 bytes
sam/bcftools/._vcfutils.pl | Bin 222 -> 0 bytes
sam/bcftools/Makefile | 51 -
sam/bcftools/README | 36 -
sam/bcftools/bcf.c | 396 ----
sam/bcftools/bcf.h | 197 --
sam/bcftools/bcf.tex | 77 -
sam/bcftools/bcf2qcall.c | 91 -
sam/bcftools/bcfutils.c | 504 -----
sam/bcftools/call1.c | 633 -------
sam/bcftools/em.c | 310 ----
sam/bcftools/fet.c | 112 --
sam/bcftools/index.c | 336 ----
sam/bcftools/kfunc.c | 162 --
sam/bcftools/kmin.c | 209 ---
sam/bcftools/kmin.h | 46 -
sam/bcftools/main.c | 191 --
sam/bcftools/mut.c | 127 --
sam/bcftools/prob1.c | 988 ----------
sam/bcftools/prob1.h | 49 -
sam/bcftools/vcf.c | 249 ---
sam/bcftools/vcfutils.pl | 567 ------
sam/bedcov.c | 127 --
sam/bedidx.c | 162 --
sam/bgzf.c | 694 -------
sam/bgzf.h | 207 ---
sam/bgzip.c | 206 ---
sam/cut_target.c | 193 --
sam/errmod.c | 130 --
sam/errmod.h | 24 -
sam/examples/._00README.txt | Bin 222 -> 0 bytes
sam/examples/._Makefile | Bin 222 -> 0 bytes
sam/examples/._bam2bed.c | Bin 222 -> 0 bytes
sam/examples/._calDepth.c | Bin 222 -> 0 bytes
sam/examples/._chk_indel.c | Bin 222 -> 0 bytes
sam/examples/._ex1.fa | Bin 222 -> 0 bytes
sam/examples/._ex1.sam.gz | Bin 222 -> 0 bytes
sam/examples/._toy.fa | Bin 222 -> 0 bytes
sam/examples/._toy.sam | Bin 222 -> 0 bytes
sam/examples/00README.txt | 23 -
sam/examples/Makefile | 50 -
sam/examples/bam2bed.c | 51 -
sam/examples/calDepth.c | 62 -
sam/examples/chk_indel.c | 83 -
sam/examples/ex1.fa | 56 -
sam/examples/ex1.sam.gz | Bin 114565 -> 0 bytes
sam/examples/toy.fa | 4 -
sam/examples/toy.sam | 14 -
sam/faidx.c | 437 -----
sam/faidx.h | 103 --
sam/kaln.c | 486 -----
sam/kaln.h | 67 -
sam/khash.h | 528 ------
sam/klist.h | 96 -
sam/knetfile.c | 632 -------
sam/knetfile.h | 75 -
sam/kprobaln.c | 280 ---
sam/kprobaln.h | 49 -
sam/kseq.h | 235 ---
sam/ksort.h | 285 ---
sam/kstring.c | 212 ---
sam/kstring.h | 169 --
sam/misc/._HmmGlocal.java | Bin 222 -> 0 bytes
sam/misc/._Makefile | Bin 222 -> 0 bytes
sam/misc/._ace2sam.c | Bin 222 -> 0 bytes
sam/misc/._bamcheck.c | Bin 222 -> 0 bytes
sam/misc/._blast2sam.pl | Bin 222 -> 0 bytes
sam/misc/._bowtie2sam.pl | Bin 222 -> 0 bytes
sam/misc/._export2sam.pl | Bin 222 -> 0 bytes
sam/misc/._interpolate_sam.pl | Bin 222 -> 0 bytes
sam/misc/._maq2sam.c | Bin 222 -> 0 bytes
sam/misc/._md5.c | Bin 222 -> 0 bytes
sam/misc/._md5.h | Bin 222 -> 0 bytes
sam/misc/._md5fa.c | Bin 222 -> 0 bytes
sam/misc/._novo2sam.pl | Bin 222 -> 0 bytes
sam/misc/._plot-bamcheck | Bin 222 -> 0 bytes
sam/misc/._psl2sam.pl | Bin 222 -> 0 bytes
sam/misc/._r2plot.lua | Bin 222 -> 0 bytes
sam/misc/._sam2vcf.pl | Bin 222 -> 0 bytes
sam/misc/._samtools.pl | Bin 222 -> 0 bytes
sam/misc/._soap2sam.pl | Bin 222 -> 0 bytes
sam/misc/._varfilter.py | Bin 222 -> 0 bytes
sam/misc/._vcfutils.lua | Bin 222 -> 0 bytes
sam/misc/._wgsim.c | Bin 222 -> 0 bytes
sam/misc/._wgsim_eval.pl | Bin 222 -> 0 bytes
sam/misc/._zoom2sam.pl | Bin 222 -> 0 bytes
sam/misc/HmmGlocal.java | 178 --
sam/misc/Makefile | 69 -
sam/misc/ace2sam.c | 249 ---
sam/misc/bamcheck.c | 1521 ---------------
sam/misc/blast2sam.pl | 92 -
sam/misc/bowtie2sam.pl | 92 -
sam/misc/export2sam.pl | 545 ------
sam/misc/interpolate_sam.pl | 125 --
sam/misc/maq2sam.c | 173 --
sam/misc/md5.c | 296 ---
sam/misc/md5.h | 57 -
sam/misc/md5fa.c | 58 -
sam/misc/novo2sam.pl | 281 ---
sam/misc/plot-bamcheck | 882 ---------
sam/misc/psl2sam.pl | 65 -
sam/misc/r2plot.lua | 83 -
sam/misc/sam2vcf.pl | 270 ---
sam/misc/samtools.pl | 528 ------
sam/misc/soap2sam.pl | 109 --
sam/misc/varfilter.py | 205 ---
sam/misc/vcfutils.lua | 694 -------
sam/misc/wgsim.c | 419 -----
sam/misc/wgsim_eval.pl | 91 -
sam/misc/zoom2sam.pl | 97 -
sam/padding.c | 479 -----
sam/phase.c | 687 -------
sam/razf.c | 853 ---------
sam/razf.h | 134 --
sam/razip.c | 141 --
sam/sam.c | 186 --
sam/sam.h | 99 -
sam/sam_header.c | 810 --------
sam/sam_header.h | 48 -
sam/sam_view.c | 441 -----
sam/sample.c | 107 --
sam/sample.h | 17 -
sam/samtools.1 | 1066 -----------
sam/win32/._xcurses.h | Bin 222 -> 0 bytes
sam/win32/._zconf.h | Bin 222 -> 0 bytes
sam/win32/._zlib.h | Bin 222 -> 0 bytes
sam/win32/xcurses.h | 1377 --------------
sam/win32/zconf.h | 332 ----
sam/win32/zlib.h | 1357 --------------
190 files changed, 235 insertions(+), 39701 deletions(-)

diff --git a/debian/control b/debian/control
index a70ec05..f904307 100644
--- a/debian/control
+++ b/debian/control
@@ -3,7 +3,7 @@ Section: science
Priority: optional
Maintainer: Debian Med Packaging Team <debian-med-***@lists.alioth.debian.org>
Uploaders: Andreas Tille <***@debian.org>
-Build-Depends: debhelper (>= 9), zlib1g-dev, libncurses5-dev
+Build-Depends: debhelper (>= 9), zlib1g-dev, libncurses5-dev, libbam-dev
Standards-Version: 3.9.6
Vcs-Browser: http://anonscm.debian.org/gitweb/?p=debian-med/rsem.git
Vcs-Git: git://anonscm.debian.org/debian-med/rsem.git
diff --git a/debian/copyright b/debian/copyright
index e89ffde..8ac9582 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -1,12 +1,9 @@
Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: <pkg>
-Source: <path_to_download>
+Upstream-Name: RSEM
+Source: http://deweylab.biostat.wisc.edu/rsem/src/rsem-1.2.19.tar.gz
Exclude: sam/

Files: *
Copyright: © 20xx-20yy <upstream>
-License: <license>
+License: GPL

-Files: debian/*
-Copyright: © 2014 maintainername <***@e.mail>
-License: <license>
diff --git a/debian/patches/fix-sam-includes b/debian/patches/fix-sam-includes
new file mode 100644
index 0000000..d0cdc83
--- /dev/null
+++ b/debian/patches/fix-sam-includes
@@ -0,0 +1,124 @@
+--- rsem.orig/BamConverter.h
++++ rsem/BamConverter.h
+@@ -8,8 +8,8 @@
+ #include<map>
+
+ #include <stdint.h>
+-#include "sam/bam.h"
+-#include "sam/sam.h"
++#include "samtools/bam.h"
++#include "samtools/sam.h"
+ #include "sam_rsem_aux.h"
+ #include "sam_rsem_cvt.h"
+
+--- rsem.orig/BamWriter.h
++++ rsem/BamWriter.h
+@@ -10,8 +10,8 @@
+ #include<iostream>
+
+ #include <stdint.h>
+-#include "sam/bam.h"
+-#include "sam/sam.h"
++#include "samtools/bam.h"
++#include "samtools/sam.h"
+ #include "sam_rsem_aux.h"
+ #include "sam_rsem_cvt.h"
+
+--- rsem.orig/SamParser.h
++++ rsem/SamParser.h
+@@ -8,8 +8,8 @@
+ #include<cassert>
+ #include<string>
+
+-#include "sam/bam.h"
+-#include "sam/sam.h"
++#include "samtools/bam.h"
++#include "samtools/sam.h"
+
+ #include "utils.h"
+ #include "my_assert.h"
+--- rsem.orig/bc_aux.h
++++ rsem/bc_aux.h
+@@ -4,7 +4,7 @@
+ #include<map>
+
+ #include <stdint.h>
+-#include "sam/bam.h"
++#include "samtools/bam.h"
+
+ struct SingleEndT {
+ bam1_t *b;
+--- rsem.orig/sam_rsem_aux.h
++++ rsem/sam_rsem_aux.h
+@@ -5,7 +5,7 @@
+ #include<cstring>
+ #include<stdint.h>
+
+-#include "sam/bam.h"
++#include "samtools/bam.h"
+
+ // dwt: duplicate without text
+ bam_header_t *bam_header_dwt(const bam_header_t *ori_h)
+--- rsem.orig/sam_rsem_cvt.h
++++ rsem/sam_rsem_cvt.h
+@@ -4,7 +4,7 @@
+ #include<vector>
+
+ #include "stdint.h"
+-#include "sam/bam.h"
++#include "samtools/bam.h"
+
+ #include "Transcript.h"
+ #include "Transcripts.h"
+--- rsem.orig/getUnique.cpp
++++ rsem/getUnique.cpp
+@@ -6,8 +6,8 @@
+ #include<vector>
+
+ #include <stdint.h>
+-#include "sam/bam.h"
+-#include "sam/sam.h"
++#include "samtools/bam.h"
++#include "samtools/sam.h"
+
+ #include "utils.h"
+
+--- rsem.orig/samValidator.cpp
++++ rsem/samValidator.cpp
+@@ -6,8 +6,8 @@
+ #include<set>
+
+ #include <stdint.h>
+-#include "sam/bam.h"
+-#include "sam/sam.h"
++#include "samtools/bam.h"
++#include "samtools/sam.h"
+
+ #include "utils.h"
+ #include "my_assert.h"
+--- rsem.orig/scanForPairedEndReads.cpp
++++ rsem/scanForPairedEndReads.cpp
+@@ -7,8 +7,8 @@
+ #include<algorithm>
+
+ #include <stdint.h>
+-#include "sam/bam.h"
+-#include "sam/sam.h"
++#include "samtools/bam.h"
++#include "samtools/sam.h"
+
+ #include "utils.h"
+ #include "my_assert.h"
+--- rsem.orig/wiggle.cpp
++++ rsem/wiggle.cpp
+@@ -4,8 +4,8 @@
+ #include <iostream>
+
+ #include <stdint.h>
+-#include "sam/bam.h"
+-#include "sam/sam.h"
++#include "samtools/bam.h"
++#include "samtools/sam.h"
+
+ #include "utils.h"
+ #include "wiggle.h"
diff --git a/debian/patches/ignore-sam b/debian/patches/ignore-sam
new file mode 100644
index 0000000..2f9c507
--- /dev/null
+++ b/debian/patches/ignore-sam
@@ -0,0 +1,101 @@
+--- rsem.orig/Makefile
++++ rsem/Makefile
+@@ -7,9 +7,6 @@
+
+ all : $(PROGRAMS)
+
+-sam/libbam.a :
+- cd sam ; ${MAKE} all
+-
+ Transcript.h : utils.h
+
+ Transcripts.h : utils.h my_assert.h Transcript.h
+@@ -48,13 +45,13 @@
+ HitContainer.h : GroupInfo.h
+
+
+-SamParser.h : sam/sam.h sam/bam.h utils.h my_assert.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Transcripts.h
++SamParser.h : utils.h my_assert.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Transcripts.h
+
+
+-rsem-parse-alignments : parseIt.o sam/libbam.a
+- $(CC) -o rsem-parse-alignments parseIt.o sam/libbam.a -lz -lpthread
++rsem-parse-alignments : parseIt.o
++ $(CC) -o rsem-parse-alignments parseIt.o /usr/lib/libbam.a -lz -lpthread
+
+-parseIt.o : utils.h GroupInfo.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h Transcripts.h sam/sam.h sam/bam.h parseIt.cpp
++parseIt.o : utils.h GroupInfo.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h Transcripts.h parseIt.cpp
+ $(CC) -Wall -O2 -c -I. parseIt.cpp
+
+
+@@ -76,36 +73,32 @@
+
+ HitWrapper.h : HitContainer.h
+
+-sam_rsem_aux.h : sam/bam.h
+-
+-sam_rsem_cvt.h : sam/bam.h Transcript.h Transcripts.h
++sam_rsem_cvt.h : Transcript.h Transcripts.h
+
+-BamWriter.h : sam/sam.h sam/bam.h sam_rsem_aux.h sam_rsem_cvt.h SingleHit.h PairedEndHit.h HitWrapper.h Transcript.h Transcripts.h
++BamWriter.h : sam_rsem_aux.h sam_rsem_cvt.h SingleHit.h PairedEndHit.h HitWrapper.h Transcript.h Transcripts.h
+
+ sampling.h : boost/random.hpp
+
+ WriteResults.h : utils.h my_assert.h GroupInfo.h Transcript.h Transcripts.h RefSeq.h Refs.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h
+
+-rsem-run-em : EM.o sam/libbam.a
+- $(CC) -o rsem-run-em EM.o sam/libbam.a -lz -lpthread
++rsem-run-em : EM.o
++ $(CC) -o rsem-run-em EM.o /usr/lib/libbam.a -lz -lpthread
+
+-EM.o : utils.h my_assert.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h simul.h sam_rsem_aux.h [...]
++EM.o : utils.h my_assert.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h simul.h sam_rsem_aux.h sampling.h boost/ra [...]
+ $(CC) $(COFLAGS) EM.cpp
+
+-bc_aux.h : sam/bam.h
+-
+-BamConverter.h : utils.h my_assert.h sam/sam.h sam/bam.h sam_rsem_aux.h sam_rsem_cvt.h bc_aux.h Transcript.h Transcripts.h
++BamConverter.h : utils.h my_assert.h sam_rsem_aux.h sam_rsem_cvt.h bc_aux.h Transcript.h Transcripts.h
+
+-rsem-tbam2gbam : utils.h Transcripts.h Transcript.h bc_aux.h BamConverter.h sam/sam.h sam/bam.h sam/libbam.a sam_rsem_aux.h sam_rsem_cvt.h tbam2gbam.cpp sam/libbam.a
+- $(CC) -O3 -Wall tbam2gbam.cpp sam/libbam.a -lz -lpthread -o $@
++rsem-tbam2gbam : utils.h Transcripts.h Transcript.h bc_aux.h BamConverter.h sam_rsem_aux.h sam_rsem_cvt.h tbam2gbam.cpp
++ $(CC) -O3 -Wall tbam2gbam.cpp /usr/lib/libbam.a -lz -lpthread -o $@
+
+-rsem-bam2wig : utils.h my_assert.h wiggle.h wiggle.o sam/libbam.a bam2wig.cpp
+- $(CC) -O3 -Wall bam2wig.cpp wiggle.o sam/libbam.a -lz -lpthread -o $@
++rsem-bam2wig : utils.h my_assert.h wiggle.h wiggle.o bam2wig.cpp
++ $(CC) -O3 -Wall bam2wig.cpp wiggle.o /usr/lib/libbam.a -lz -lpthread -o $@
+
+-rsem-bam2readdepth : utils.h my_assert.h wiggle.h wiggle.o sam/libbam.a bam2readdepth.cpp
+- $(CC) -O3 -Wall bam2readdepth.cpp wiggle.o sam/libbam.a -lz -lpthread -o $@
++rsem-bam2readdepth : utils.h my_assert.h wiggle.h wiggle.o bam2readdepth.cpp
++ $(CC) -O3 -Wall bam2readdepth.cpp wiggle.o /usr/lib/libbam.a -lz -lpthread -o $@
+
+-wiggle.o: sam/bam.h sam/sam.h wiggle.cpp wiggle.h
++wiggle.o: wiggle.cpp wiggle.h
+ $(CC) $(COFLAGS) wiggle.cpp
+
+ rsem-simulate-reads : simulation.o
+@@ -130,14 +123,14 @@
+ calcCI.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h WriteResults.h Buffer.h calcCI.cpp
+ $(CC) $(COFLAGS) calcCI.cpp
+
+-rsem-get-unique : sam/bam.h sam/sam.h getUnique.cpp sam/libbam.a
+- $(CC) -O3 -Wall getUnique.cpp sam/libbam.a -lz -lpthread -o $@
++rsem-get-unique : getUnique.cpp
++ $(CC) -O3 -Wall getUnique.cpp /usr/lib/libbam.a -lz -lpthread -o $@
+
+-rsem-sam-validator : sam/bam.h sam/sam.h my_assert.h samValidator.cpp sam/libbam.a
+- $(CC) -O3 -Wall samValidator.cpp sam/libbam.a -lz -lpthread -o $@
++rsem-sam-validator : my_assert.h samValidator.cpp
++ $(CC) -O3 -Wall samValidator.cpp /usr/lib/libbam.a -lz -lpthread -o $@
+
+-rsem-scan-for-paired-end-reads : sam/bam.h sam/sam.h my_assert.h scanForPairedEndReads.cpp sam/libbam.a
+- $(CC) -O3 -Wall scanForPairedEndReads.cpp sam/libbam.a -lz -lpthread -o $@
++rsem-scan-for-paired-end-reads : my_assert.h scanForPairedEndReads.cpp
++ $(CC) -O3 -Wall scanForPairedEndReads.cpp /usr/lib/libbam.a -lz -lpthread -o $@
+
+ ebseq :
+ cd EBSeq ; ${MAKE} all
diff --git a/debian/patches/series b/debian/patches/series
new file mode 100644
index 0000000..f5f8b23
--- /dev/null
+++ b/debian/patches/series
@@ -0,0 +1,2 @@
+ignore-sam
+fix-sam-includes
diff --git a/debian/rules b/debian/rules
index 3c54ae9..f49445e 100755
--- a/debian/rules
+++ b/debian/rules
@@ -19,5 +19,5 @@ override_dh_auto_install:
dh_install rsem_perl_utils.pm /usr/lib/perl5
dh_auto_install

-#get-orig-source:
-# . debian/get-orig-source
+get-orig-source:
+ uscan --verbose --force-download --repack --compression xz
diff --git a/debian/watch b/debian/watch
index 7fadcce..3631dcc 100644
--- a/debian/watch
+++ b/debian/watch
@@ -1,33 +1,4 @@
version=3

-# Uncomment to examine a Webpage
-# <Webpage URL> <string match>
-#http://www.example.com/downloads.php #PACKAGE#-(.*)\.tar\.gz
-
-# Uncomment to examine a Webserver directory
-#http://www.example.com/pub/#PACKAGE#-(.*)\.tar\.gz
-
-# Uncommment to examine a FTP server
-#ftp://ftp.example.com/pub/#PACKAGE#-(.*)\.tar\.gz debian uupdate
-
-# Uncomment to find new files on sourceforge
-# http://sf.net/#PACKAGE#/#PACKAGE#-(\d[\d\.]+)\.(?:tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz)))
-
-# Uncomment to find new files on GooglePages
-# http://code.google.com/p/#PACKAGE#/downloads/list?can=1 \
-# .*/#PACKAGE#-([-.\d]+)\.(?:tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz))|zip)
-
-# Uncomment to find new files on Github
-# - when using releases:
-# https://github.com/#GITHUBUSER#/#PACKAGE#/releases .*/archive/#PREFIX#(\d[\d.-]+)\.(?:tar(?:\.gz|\.bz2)?|tgz)
-# - when using tags
-# https://github.com/#GITHUBUSER#/#PACKAGE#/tags .*/#PREFIX#(\d.*)\.(?:tgz|tbz2|txz|tar\.(?:gz|bz2|xz))
-# Remark: frequently you can do s/#PREFIX#/v?/ since 'v' or nothing is quite common but there are other prefixes possible
-
-# if tweaking of source is needed
-# \
-# debian debian/get-orig-source
-
-# if you need to repack and choose +dfsg prefix
-# opts=dversionmangle=s/[~\+]dfsg[0-9]*// \
-#
+opts="repacksuffix=+dfsg,dversionmangle=s/\+dfsg//g" \
+ https://github.com/bli25wisc/rsem/releases .*/archive/v(\d[\d.-]+)\.(?:tar(?:\.gz|\.bz2)?|tgz)
diff --git a/sam/.DS_Store b/sam/.DS_Store
deleted file mode 100644
index ee99731..0000000
Binary files a/sam/.DS_Store and /dev/null differ
diff --git a/sam/.gitignore b/sam/.gitignore
deleted file mode 100644
index bb605d4..0000000
--- a/sam/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-*.o
-.*.swp
-*.a
-*.dSYM
diff --git a/sam/AUTHORS b/sam/AUTHORS
deleted file mode 100644
index 95afabb..0000000
--- a/sam/AUTHORS
+++ /dev/null
@@ -1,20 +0,0 @@
-Heng Li from the Sanger Institute wrote most of the initial source codes
-of SAMtools and various converters.
-
-Bob Handsaker from the Broad Institute is a major contributor to the
-SAM/BAM specification. He designed and implemented the BGZF format, the
-underlying indexable compression format for the BAM format. BGZF does
-not support arithmetic between file offsets.
-
-Jue Ruan for the Beijing Genome Institute designed and implemented the
-RAZF format, an alternative indexable compression format. RAZF supports
-arithmetic between file offsets, at the cost of increased index file
-size and the full compatibility with gzip. RAZF is optional and only
-used in `faidx' for indexing RAZF compressed fasta files.
-
-Colin Hercus updated novo2sam.pl to support gapped alignment by
-novoalign.
-
-Petr Danecek contributed the header parsing library sam_header.c and
-sam2vcf.pl script and added knet support to the RAZF library.
-
diff --git a/sam/COPYING b/sam/COPYING
deleted file mode 100644
index 82fa2f4..0000000
--- a/sam/COPYING
+++ /dev/null
@@ -1,21 +0,0 @@
-The MIT License
-
-Copyright (c) 2008-2009 Genome Research Ltd.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
\ No newline at end of file
diff --git a/sam/ChangeLog.old b/sam/ChangeLog.old
deleted file mode 100644
index 19aefae..0000000
--- a/sam/ChangeLog.old
+++ /dev/null
@@ -1,3875 +0,0 @@
-commit db2ad3e19068cbafde72ecde75d0638bbb3598ba
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 16 14:45:17 2012 -0500
-
- removed downsample.c
-
-commit 6c55c576903992c6fef148fe3b606fbc8bd10655
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 16 14:45:06 2012 -0500
-
- print to output
-
-commit db1044a34e6049c87eaa63c39ed6e56f03e7d4c1
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 16 14:39:34 2012 -0500
-
- removed sample
-
- Downsampling already exists in "view". View also keeps pairing while "sample" does not.
-
-commit ffdeed3e5d4a530bfdf6f9ba97fff0ba7add6cba
-Merge: 2daad7b accf026
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 16 14:22:15 2012 -0500
-
- Merge branch 'master' of github.com:lh3/samtools
-
-commit accf0260fd1117e10047344345d40b31a9ec31bb
-Merge: 9134e0d c554160
-Author: Heng Li <***@me.com>
-Date: Thu Feb 16 11:21:14 2012 -0800
-
- Merge pull request #8 from nh13/master
-
- Patches
-
-commit c554160df16ec7748cfdda4c7b54c641be7b809f
-Author: Nils Homer <***@lifetech.com>
-Date: Thu Feb 16 14:06:52 2012 -0500
-
- * more README.md work
-
-commit 2a81ffe349208d917666808fbc9f3041e0cb57de
-Author: Nils Homer <***@lifetech.com>
-Date: Thu Feb 16 14:06:10 2012 -0500
-
- * more README work
-
-commit fb3125f732715f62cded8685a23a002a96ce009b
-Author: Nils Homer <***@lifetech.com>
-Date: Thu Feb 16 14:05:19 2012 -0500
-
- * more README work
-
-commit 444d41002c37e1c3d0f9208b4a88126c47276386
-Author: Nils Homer <***@lifetech.com>
-Date: Thu Feb 16 14:02:13 2012 -0500
-
- * updating README
-
-commit dec53cb1043fe7efadfde75fa2fd39b76de22e54
-Author: Nils Homer <***@lifetech.com>
-Date: Thu Feb 16 13:55:01 2012 -0500
-
- updating the README for markdown syntax
-
-commit 798da18c346dca8ec6005582a0ddb1d5420b04ca
-Author: Nils Homer <***@lifetech.com>
-Date: Thu Feb 16 13:48:35 2012 -0500
-
- adding a README with the current differences between this repository and
- the official one
-
-commit 4d22d86c0f28636662f2144a88cd168e104c4275
-Author: Nils Homer <***@lifetech.com>
-Date: Thu Feb 16 13:35:03 2012 -0500
-
- adding "samtools sample" to the main
-
-commit 893c25a37c21005dc42f45d45e9ad78ddc5f29bb
-Author: Nils Homer <***@lifetech.com>
-Date: Thu Feb 16 13:33:51 2012 -0500
-
- * removing some compile flags to work with OS X
-
-commit 7ac22f72fdc32edd5c24af6baebfa7db5faf8e7b
-Author: Jonathan Manning <***@lifetech.com>
-Date: Thu Feb 16 10:47:14 2012 -0500
-
- Check write filehandle after opening for write. tamw/tamr is a union type, so change is only semantic.
-
- Signed-off-by: Nils Homer <***@lifetech.com>
-
-commit fef53330416631690f60fdff42b6e43d764170dc
-Author: Jonathan Manning <***@lifetech.com>
-Date: Thu Feb 16 10:44:59 2012 -0500
-
- Catch and report invalid BAM header, instead of segfaulting later on.
-
- Signed-off-by: Nils Homer <***@lifetech.com>
-
-commit 5cc013fe4930bf9b6e7963aab1cd4a3c94f695bc
-Author: Jonathan Manning <***@lifetech.com>
-Date: Thu Feb 16 10:44:16 2012 -0500
-
- Add downsample to examples.
-
- Signed-off-by: Nils Homer <***@lifetech.com>
-
-commit b3fa9e7071532905a81dc7aa48eadc24b8c8846b
-Author: Jonathan Manning <***@lifetech.com>
-Date: Thu Feb 16 10:43:48 2012 -0500
-
- Adjust for leading hard clip on colorspace reads.
-
- Signed-off-by: Nils Homer <***@lifetech.com>
-
-commit 1a9296c1389469d1c1db5b8069f0e11ffcc8abb2
-Author: Jonathan Manning <***@lifetech.com>
-Date: Thu Feb 16 10:42:52 2012 -0500
-
- Add samtools sample command, contributed by Davide Cittaro <***@ifom-ieo-campus.it>.
-
- Signed-off-by: Nils Homer <***@lifetech.com>
-
-commit 2a804f3379748aeba944f1dec306dd726ff3235e
-Author: Jonathan Manning <***@lifetech.com>
-Date: Thu Feb 16 10:42:07 2012 -0500
-
- Add samtools qa command, contributed by Roman Valls Guimera <***@scilifelab.se>.
-
- Signed-off-by: Nils Homer <***@lifetech.com>
-
-commit 0f3207fe8fd93e44d40fcf57204079c8c06d24a6
-Author: Jonathan Manning <***@lifetech.com>
-Date: Thu Feb 16 10:39:08 2012 -0500
-
- Makefile cleanup - allow CC, CFLAGS, LDFLAGS to be passed on make command line. Use LDFLAGS in samtools compile.
-
- Signed-off-by: Nils Homer <***@lifetech.com>
-
-commit 6e7df604025f6a86881bf7f4a16f30e15d31538a
-Author: Jonathan Manning <***@lifetech.com>
-Date: Thu Feb 16 10:31:15 2012 -0500
-
- Allow max_mem for sort to be specified with units.
-
- Signed-off-by: Nils Homer <***@lifetech.com>
-
-commit f12ebcaf6e60d34180a27d70e09b743cef140b98
-Author: Jonathan Manning <***@lifetech.com>
-Date: Thu Feb 16 10:29:11 2012 -0500
-
- Allow user defined [lowercase] tags in header elements.
-
- Signed-off-by: Nils Homer <***@lifetech.com>
-
-commit 50b931fa3312dc109537a4260698ddecd0f06a05
-Author: Jonathan Manning <***@lifetech.com>
-Date: Thu Feb 16 10:27:11 2012 -0500
-
- Check lowerbound in text entry box to avoid segfault in tview. Remove redundant call to bam_aux_get.
-
- Signed-off-by: Nils Homer <***@lifetech.com>
-
-commit 5e729da5190949a813d20d329eab7ddb661816bd
-Author: Nils Homer <***@lifetech.com>
-Date: Thu Feb 16 10:31:48 2012 -0500
-
- * fixing overflow/underflow in integer parsing
-
-commit fa50a4330b9abedaf07c26e13d31f05e57f1d319
-Author: Nils Homer <***@lifetech.com>
-Date: Thu Feb 16 10:30:40 2012 -0500
-
- * updating help message for samtools depth
-
-commit 79e52c9624b6dd3bdfdf439f4b4bc6f774c230a4
-Author: Nils Homer <***@lifetech.com>
-Date: Thu Feb 16 10:29:32 2012 -0500
-
- * adding support for outputting a circos histogram file in "samtools depth". Use
- the "-c/-B" options.
-
-commit 2daad7b52daa86561c0fb65fe366691fad9f5ed3
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 16 09:31:57 2012 -0500
-
- bugfix: wrong SP; missing DV in the VCF hdr
-
-commit 9134e0d5047c281ef3bd53da91771d4814a5131c
-Author: Heng Li <***@live.co.uk>
-Date: Wed Feb 8 11:19:12 2012 -0500
-
- missing support of DV
-
-commit 34ebf12078c1d1015a0b8b9a9221243a60b22893
-Author: Heng Li <***@live.co.uk>
-Date: Wed Feb 8 11:08:56 2012 -0500
-
- new BCF DV format: number of variant reads
-
-commit 9589d3312fa2d076f48bdd68e2a5edd419c8070c
-Author: Heng Li <***@live.co.uk>
-Date: Tue Jan 10 10:30:27 2012 -0500
-
- scale depth to quality (hidden option)
-
-commit 704473e14668333ecaca5fb7b238af405c43e3b1
-Author: Heng Li <***@live.co.uk>
-Date: Tue Jan 10 10:18:17 2012 -0500
-
- really nothing
-
-commit 01b307fd287962372bbf07461c88b54f41636817
-Author: Heng Li <***@live.co.uk>
-Date: Wed Dec 7 13:07:42 2011 -0500
-
- added an example containing 'B'
-
-commit c678791f0451ceb9205c1ab5c52c84641863c99a
-Author: Heng Li <***@live.co.uk>
-Date: Sat Dec 3 12:10:30 2011 -0500
-
- 'B' now moves backward w.r.t. the query
-
-commit 152119bc06a073933ca830e8e1407538e44626cc
-Author: Heng Li <***@live.co.uk>
-Date: Fri Dec 2 10:50:12 2011 -0500
-
- better consensus; a little more robust
-
-commit 454da4754ac503edda5b1329b67757d797e46e07
-Author: Heng Li <***@live.co.uk>
-Date: Fri Dec 2 00:20:22 2011 -0500
-
- in pileup call remove_B()
-
-commit ff2bcac1cc078ba1879f18c89cfae314439d7086
-Author: Heng Li <***@live.co.uk>
-Date: Fri Dec 2 00:17:32 2011 -0500
-
- working on a few toy examples
-
-commit 745ca7260158d6df7897b52598033ffb055a9e4f
-Author: Heng Li <***@live.co.uk>
-Date: Thu Dec 1 22:55:39 2011 -0500
-
- bam_remove_B(); not tested
-
-commit 07e4cdc7300abfcc82e03105b4689f95cab551cd
-Author: Heng Li <***@live.co.uk>
-Date: Thu Nov 10 12:58:55 2011 -0500
-
- baseQ threshold on plain pipleup; removed -E
-
-commit 322ebf2082dfa91df44b3a996d26c85357e5d5a2
-Author: Heng Li <***@live.co.uk>
-Date: Wed Oct 19 09:28:04 2011 -0400
-
- fixed two gcc warnings
-
-commit a632457b4c4adc50d833b56b5a5231feafaf8193
-Author: Heng Li <***@live.co.uk>
-Date: Tue Oct 4 10:13:23 2011 -0400
-
- change size_t to uint32_t in bam_header_t
-
- This may cause issues on 64-bit big-endian machines. Reported and fixed by Paolo Emilio Mazzon.
-
-commit af31bf5a78aea03baf6eb90fe50076549d499f6e
-Author: Heng Li <***@live.co.uk>
-Date: Mon Sep 26 20:17:57 2011 -0400
-
- rename pad2unpad to depad
-
-commit 77b198b73dfad1048e5d1c5a64aa75ee7b90f596
-Author: Heng Li <***@live.co.uk>
-Date: Fri Sep 23 01:22:40 2011 -0400
-
- convert padded BAM to unpadded BAM
-
-commit adb9e2342b7b7501d9527d3c23afab10469ae2c6
-Author: Heng Li <***@live.co.uk>
-Date: Wed Sep 7 11:40:50 2011 -0400
-
- generate template cigar with "fixmate"
-
-commit 46e5ab445a0fe880216cbc0daf1225725b569d7a
-Author: Heng Li <***@live.co.uk>
-Date: Fri Sep 2 12:50:18 2011 -0400
-
- update kseq.h to the latest version
-
-commit 68e9e4a73eb91405bb3e56bf0cdaf12d1b487abb
-Author: Heng Li <***@live.co.uk>
-Date: Fri Sep 2 12:44:45 2011 -0400
-
- Release samtools-0.1.18
-
-commit aa06bdadb2d109a79f927f478102f96a1f5fd258
-Author: Heng Li <***@live.co.uk>
-Date: Fri Sep 2 12:14:17 2011 -0400
-
- updated the revision number
-
-commit 267e1e1b6e54c0ab24f94cd9aee9cbd2d1923f9f
-Merge: 19ff1d3 aebab30
-Author: Heng Li <***@live.co.uk>
-Date: Fri Sep 2 12:13:08 2011 -0400
-
- Merge https://github.com/lh3/samtools into reduce
-
- Conflicts:
- bam_md.c
-
- Fixed a few typos in the merge
-
-commit aebab302399c24eaa6c5ab79d13d6bd5e2e9ea9a
-Merge: c2c63d0 da62663
-Author: Heng Li <***@live.co.uk>
-Date: Fri Sep 2 09:03:49 2011 -0700
-
- Merge pull request #4 from peterjc/x_equals2
-
- Implement basic support for =/X CIGAR operations
-
-commit 19ff1d3d7f47d7e61b121292aefe5a74bb8a18d2
-Author: Heng Li <***@live.co.uk>
-Date: Thu Aug 25 16:38:12 2011 -0400
-
- reduce BAM size (experimental)
-
-commit da626630fd98fd4e07ceb4d58c5c9a42d312a85d
-Author: peterjc <***@googlemail.com>
-Date: Mon Aug 22 06:58:08 2011 +0100
-
- Support =/X CIGAR operations (treated like M)
-
-commit 461d8003529db77a4d5ecbd108312e868b051a3d
-Author: peterjc <***@googlemail.com>
-Date: Mon Aug 22 05:52:56 2011 +0100
-
- Define CIGAR equals and X operationss (7 and 8)
-
-commit c2c63d067113baab41f3bc35fb28f4f00578accb
-Merge: 7ab3ef3 9a0ed9a
-Author: Heng Li <***@live.co.uk>
-Date: Thu Aug 18 17:21:54 2011 -0700
-
- Merge pull request #3 from peterjc/x_equals
-
- Accept SAM files using = in CIGAR (treats X and = as M)
-
-commit 9a0ed9a6b85c7981465f459300208dbd93e3c6f5
-Author: peterjc <***@googlemail.com>
-Date: Thu Aug 18 19:28:52 2011 +0100
-
- Accept SAM files using = in CIGAR (treats X and = as M)
-
-commit 7ab3ef388c1eb34d7912fd70cc5656c955240263
-Author: Heng Li <***@live.co.uk>
-Date: Mon Aug 8 10:22:22 2011 -0400
-
- bugfix: indexing takes huge memory
-
- This happens when an unmapped mate has coordinate 1. Thank Joel Martin for the fix.
-
-commit a3f6738593e944354a8f75306687d8b3acf08bf1
-Merge: a8bdca9 bc67ea2
-Author: Heng Li <***@live.co.uk>
-Date: Mon Aug 8 09:52:26 2011 -0400
-
- Merge branch 'master' of github.com:lh3/samtools
-
-commit bc67ea225da653f36a70b38382d6111dd494f659
-Author: Petr Danecek <***@sanger.ac.uk>
-Date: Thu Jul 28 20:03:16 2011 +0100
-
- Variant Distance Bias
-
-commit deb578f0c49d0b7d8c3bc6be220b4d67e2e7dfdf
-Author: Petr Danecek <***@sanger.ac.uk>
-Date: Tue Jul 26 09:57:37 2011 +0100
-
- If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but use the tag instead.
-
-commit a8bdca9cf482a637b89ee4f98469a93e0ab5e69b
-Author: Heng Li <***@live.co.uk>
-Date: Mon Jul 25 10:10:55 2011 -0400
-
- bugfix: LRT2=nan
-
-commit 0afe33137d046a3e849eeb4a54590f27cbad4228
-Author: Heng Li <***@live.co.uk>
-Date: Fri Jul 22 21:55:38 2011 -0400
-
- fixed a bug/typo
-
-commit 62d5849658c10222d40308c6b53ab4f99a448494
-Author: Heng Li <***@live.co.uk>
-Date: Fri Jul 15 16:04:19 2011 -0400
-
- allow setting the seed in subsampling
-
-commit 5f46243824cc9435b167973e1d51e13128794ea1
-Author: Heng Li <***@live.co.uk>
-Date: Fri Jul 15 15:54:47 2011 -0400
-
- support subsampling
-
-commit 5e55b6f34fc86cba7cf98d52ccaed405c3ffabbc
-Author: Heng Li <***@live.co.uk>
-Date: Fri Jul 15 15:53:38 2011 -0400
-
- support indels
-
-commit f31c162926d6f43e8b60171789a258d02e1f9be5
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jul 7 17:02:33 2011 -0400
-
- do not count indel with "view -Y"
-
-commit e412dae587883b4c17e5fbf4b7c33f38bfa8458a
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jul 7 00:35:25 2011 -0400
-
- for WIN32 compatibility
-
-commit 70a52501bcfa63824749893a5ab8ed3c38e34958
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jul 7 00:32:46 2011 -0400
-
- for WIN32 compatibility
-
-commit 00438f14ed5984f08e8f7645a9b95644a812f969
-Author: Heng Li <***@live.co.uk>
-Date: Wed Jul 6 23:41:45 2011 -0400
-
- fixed an uninitialized variable
-
-commit 7609c4a01059c326544b3d0142dfe9c4229d68c6
-Author: Heng Li <***@live.co.uk>
-Date: Wed Jul 6 23:39:31 2011 -0400
-
- fixed an uninitialized variable
-
-commit cec7189a412f80ccb068a73bd28528915c16b0bf
-Author: Heng Li <***@live.co.uk>
-Date: Wed Jul 6 22:53:19 2011 -0400
-
- Release samtools-0.1.17
-
-commit 93c06a249de3bb666029bf07b66de5e8e5e314fa
-Author: Heng Li <***@live.co.uk>
-Date: Wed Jul 6 09:46:09 2011 -0400
-
- bugfix: incorrect idxstats for the last seq
-
- Again, this bug is caused by 3rd-party code for the sorting order checking.
-
-commit 84f6ca62db6e27b8c4c711e7b5f3ca704bf27b4f
-Author: Heng Li <***@live.co.uk>
-Date: Tue Jul 5 23:30:23 2011 -0400
-
- output mapping quality in the old pileup format
-
-commit 362e05fd670886acaede69b864903d730b9db3ca
-Author: Heng Li <***@live.co.uk>
-Date: Tue Jul 5 21:59:22 2011 -0400
-
- added a brief description of the VCF format
-
-commit e690a696468205e0cc4560016361c997660dd496
-Author: Heng Li <***@live.co.uk>
-Date: Tue Jul 5 16:23:10 2011 -0400
-
- improved samtools manual page
-
-commit 362b4a1408ef3c32311d638aa8d85ce39c1c7b2d
-Author: Heng Li <***@live.co.uk>
-Date: Tue Jul 5 15:58:29 2011 -0400
-
- merge bcftools.1 to samtools.1
-
-commit 643e0e61ba7266efbc9e5bfcb8e41f369ba2ce0a
-Author: Heng Li <***@live.co.uk>
-Date: Tue Jul 5 13:39:02 2011 -0400
-
- mpileup: when region set, set reference properly
-
-commit 613e4d67624a94f62563935fbd5cc294df69605a
-Author: Heng Li <***@live.co.uk>
-Date: Mon Jul 4 23:29:02 2011 -0400
-
- compute the min PL diff
-
-commit 5b7d5d3f52b97ca42c8500eede808dab88a46a53
-Author: Heng Li <***@live.co.uk>
-Date: Mon Jul 4 22:57:48 2011 -0400
-
- rename trio.c to mut.c
-
-commit 84fe96ad64b0365ead93a4115d1684b9bebb98fc
-Author: Heng Li <***@live.co.uk>
-Date: Sun Jul 3 15:38:51 2011 -0400
-
- added pair caller interface; not tested
-
-commit 2f2867b87b84c35319cc416d6173819d5c8a4e8c
-Author: Heng Li <***@live.co.uk>
-Date: Sun Jul 3 15:24:23 2011 -0400
-
- initial implementation of a pair caller
-
-commit e97653cf2ad653c95886933c42a2b5492ccab5ff
-Author: Heng Li <***@live.co.uk>
-Date: Sun Jul 3 00:06:28 2011 -0400
-
- convert bam to single-end fastq
-
-commit e8013e11f7a8db0a8d18c60d130169cca39bf2bd
-Author: Heng Li <***@live.co.uk>
-Date: Sat Jul 2 14:39:18 2011 -0400
-
- improve BED parsing
-
-commit 1025714325fdc636aeee47a76db8dafbbbfde64b
-Author: Heng Li <***@live.co.uk>
-Date: Fri Jul 1 14:19:54 2011 -0400
-
- update the manual page
-
-commit 8022d0039dff47b1c11b2421357d510c1f28ae15
-Author: Heng Li <***@live.co.uk>
-Date: Fri Jul 1 14:17:03 2011 -0400
-
- output the best constrained genotypes in trio
-
-commit 18c87295e12f5bebafdcae00d52000fb94c8a566
-Author: Heng Li <***@live.co.uk>
-Date: Fri Jul 1 11:18:14 2011 -0400
-
- added documentation for view -T
-
-commit daf7a8d96bd495296bf7c7d99cddb808a3ced7d5
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jun 30 22:45:20 2011 -0400
-
- fixed a bug in writing SP
-
-commit e5c32bf9b28c6e3e861db88de56b5dbe11058b61
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jun 30 22:35:25 2011 -0400
-
- optionally output read positions in mpileup
-
-commit 1008051155ec994c1901e18f3eb03ea32a62e5d7
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jun 30 22:17:25 2011 -0400
-
- make faidx work with <2GB lines
-
-commit 2daebb63762425dd3074ddf71582ad189001e394
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jun 30 17:28:58 2011 -0400
-
- fixed an issue in the trio caller and the indel caller
-
-commit 9fdd52cf0716fb342a94946433d564b28b230835
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jun 30 13:34:01 2011 -0400
-
- Added trio caller; NOT tested yet
-
-commit ea22a8ed83625e9c82382b56acc42a2d9cfd17e5
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jun 30 11:42:29 2011 -0400
-
- convert PL to 10-likelihood GL
-
-commit 10d7065267b0d12c2bfcb6c70204fb6944cd395d
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jun 30 10:49:05 2011 -0400
-
- fix a compatibility issue with the new bcftools
-
-commit d340f01f609c61b719d38a6a55629a3fc899e1cd
-Author: Heng Li <***@live.co.uk>
-Date: Sun Jun 26 23:41:20 2011 -0400
-
- allow to ignore RG
-
-commit d6321faf98ebfe899b9409fb23c90a4aa8c6b542
-Author: Heng Li <***@live.co.uk>
-Date: Sun Jun 5 23:05:21 2011 -0400
-
- fixed a bug in SO checking due to a recent change
-
-commit bc995abf666d0c9ab4258f6c1b3518a45a89209f
-Author: Heng Li <***@live.co.uk>
-Date: Fri Jun 3 14:45:36 2011 -0400
-
- update the version number
-
-commit 9e7cd83a08383858d008e0ccb2238a2b93831d6c
-Author: Heng Li <***@live.co.uk>
-Date: Fri Jun 3 14:43:12 2011 -0400
-
- smarter way to parse a region string
-
-commit e58a90a0fde54053dac65352b34c13c3fea815fc
-Author: Heng Li <***@live.co.uk>
-Date: Wed Jun 1 14:36:22 2011 -0400
-
- output LRT2 instead of LRT1
-
-commit 08f78c9af3e5661f04f80bef424232de721dba03
-Author: Heng Li <***@live.co.uk>
-Date: Wed Jun 1 14:02:28 2011 -0400
-
- genotype test, but assuming 1-degree
-
-commit 587b852340d7e60f6f7cf474a92ef77aeab46018
-Author: Heng Li <***@live.co.uk>
-Date: Wed Jun 1 12:55:19 2011 -0400
-
- perform 2-degree test by default
-
-commit 3d38e403c5c830478b7eb157a484776997440501
-Author: Heng Li <***@live.co.uk>
-Date: Wed Jun 1 12:44:34 2011 -0400
-
- fixed a typo; but the result is still not good
-
-commit 06291624f7dcc57445676f3be25d0bc355dd7110
-Author: Heng Li <***@live.co.uk>
-Date: Wed Jun 1 12:24:18 2011 -0400
-
- fixed a typo
-
-commit 63b98aa33636b0d82a435bf49153c8c1502e7d42
-Author: Heng Li <***@live.co.uk>
-Date: Wed Jun 1 12:23:37 2011 -0400
-
- added HWE+F<0 filter
-
-commit 37d926e8999999b593d0637ab7dc379dbd3d6006
-Author: Heng Li <***@live.co.uk>
-Date: Wed May 4 10:11:59 2011 -0400
-
- improved sorting order checking in index
-
- Patches from Jonathan Manning
-
-commit 1c2dc6762c5f7cd946046b53346513f2f9761dbf
-Author: Heng Li <***@live.co.uk>
-Date: Tue May 3 23:09:05 2011 -0400
-
- added r^2 estimate; added Brent's method
-
-commit c2d3bcd8f98e31668b5f1321222fbc6fd6336e75
-Author: Heng Li <***@live.co.uk>
-Date: Sun May 1 23:45:23 2011 -0400
-
- combine several utilities into vcfutils.lua
-
-commit be2e7362d7593ea4d03fb33cdb6af2aa096ca6c4
-Author: Heng Li <***@live.co.uk>
-Date: Wed Apr 27 21:09:22 2011 -0400
-
- minor warning
-
-commit 683ef0443860813d743cf84fa86dda9bfaf5445a
-Author: Heng Li <***@live.co.uk>
-Date: Wed Apr 27 10:10:38 2011 -0400
-
- added versioning
-
-commit ed72f25ec85671f7646dbc92fa7b5b1dda427f7d
-Author: Heng Li <***@live.co.uk>
-Date: Wed Apr 27 10:04:02 2011 -0400
-
- Output ML allele count
-
-commit 2a9e36d2d6c405b2411ca47458f028ada8fe1000
-Author: Heng Li <***@live.co.uk>
-Date: Tue Apr 26 16:14:20 2011 -0400
-
- use ar -s
-
-commit 7a4f54e6dbcd7c94acbb3f1050a93f94b8a07949
-Author: Heng Li <***@live.co.uk>
-Date: Sat Apr 23 01:22:31 2011 -0400
-
- added another type of LRT
-
-commit b9c5e84762a4aacce3a3771b51ea80967c79a2e5
-Author: Heng Li <***@live.co.uk>
-Date: Fri Apr 22 16:00:31 2011 -0400
-
- added version
-
-commit 8fad6677c5952efd67391581d64e67e02e7f6e68
-Author: Heng Li <***@live.co.uk>
-Date: Fri Apr 22 00:30:19 2011 -0400
-
- remove the pileup command
-
-commit 3a962fb6ebf779de70f9e6effb2d8701a9aa3dd9
-Author: Heng Li <***@live.co.uk>
-Date: Thu Apr 21 23:10:45 2011 -0400
-
- Release 0.1.16 (r963:234)
-
-commit b4d683cffbd98c43f05aff8610b37d63dd7e54aa
-Author: Heng Li <***@live.co.uk>
-Date: Thu Apr 21 12:44:44 2011 -0400
-
- fixed a bug when coordinate-less reads are on the reverse strand
-
-commit c5ec45a128f409debc6a56a798024f53004037dc
-Author: Heng Li <***@live.co.uk>
-Date: Wed Apr 20 11:36:52 2011 -0400
-
- added option '-f' to merge to avoid overwriting
-
-commit 68d431531370d24907c01a27f166f2341d7c4d35
-Author: Heng Li <***@live.co.uk>
-Date: Wed Apr 20 10:26:58 2011 -0400
-
- do not print a warning
-
-commit 32922607e51ad2260c337eb022b9e4aedacb049f
-Author: Heng Li <***@live.co.uk>
-Date: Wed Apr 20 10:21:06 2011 -0400
-
- Added ldpair to compute LD between requested pairs
-
-commit b8d6fa71b91678fa02338257e0707d1e5ca098dd
-Author: Heng Li <***@live.co.uk>
-Date: Sun Apr 17 21:51:43 2011 -0400
-
- On a toy sample, type "B" seems to be accepted
-
-commit 0e7ee9a6bb4029184202aa6e6738105ba0c0510b
-Author: Heng Li <***@live.co.uk>
-Date: Sun Apr 17 21:21:20 2011 -0400
-
- added type "B"; not tested yet
-
-commit a513dfad0ac0062b03871eb6ecf26cb8d18dc895
-Author: Heng Li <***@live.co.uk>
-Date: Sun Apr 17 19:25:54 2011 -0400
-
- fixed a bug in bedidx.c: input BED not sorted
-
-commit de1e192bb0a8a762a54a6eee81d882fab01c3d32
-Author: Heng Li <***@live.co.uk>
-Date: Sun Apr 17 18:51:08 2011 -0400
-
- by default, always perform posterior chi^2
-
-commit df6e0d1099895fc6cd7a19dc89fba95ed6654d35
-Author: Heng Li <***@live.co.uk>
-Date: Sat Apr 16 12:33:28 2011 -0400
-
- added debugging
-
-commit 8ce52e024dc2ef361dbd5399c232163055057e70
-Author: Heng Li <***@live.co.uk>
-Date: Sat Apr 16 00:59:05 2011 -0400
-
- avoid a segfault given wrong input
-
-commit e66b6684fc9a397f91ec29fdeecae9f8eb986a55
-Author: Heng Li <***@live.co.uk>
-Date: Fri Apr 15 19:55:39 2011 -0400
-
- do not segfault when there is no PL
-
-commit 9ce3c584ec0cebfa45576f2ef538df4dad2b7e55
-Author: Heng Li <***@live.co.uk>
-Date: Fri Apr 15 11:59:55 2011 -0400
-
- remove another unused part
-
-commit f53a051d68bf312ac8d5865210fae7a9808c0fb9
-Author: Heng Li <***@live.co.uk>
-Date: Fri Apr 15 10:41:25 2011 -0400
-
- print G3 if HWE is small
-
-commit 4b2c08bb86ca4ed4959e4cb77a28f7d6fc19f5c9
-Author: Heng Li <***@live.co.uk>
-Date: Fri Apr 15 10:04:34 2011 -0400
-
- fixed a bug
-
- actually not fix, but hide it
-
-commit 088e13c32453fb533b7bb1c65a573f9b90a23625
-Author: Heng Li <***@live.co.uk>
-Date: Fri Apr 15 09:48:47 2011 -0400
-
- added LRT based permutation; not used though
-
-commit 1e3c2001afcb80b5eaa4c3f88df9da7b01b62524
-Author: Heng Li <***@live.co.uk>
-Date: Fri Apr 15 09:28:55 2011 -0400
-
- Perform posterior contrast for small LRT
-
- Posterior contrast is much slower than LRT. Nonetheless, posterior P-value is
- more robust to sequencing artifacts. Thus we may combine the two to achieve a
- balance between speed and low FPR.
-
-commit 6f1b066270902198a7175ff6c1b05ebc8d1919be
-Author: Heng Li <***@live.co.uk>
-Date: Fri Apr 15 01:36:06 2011 -0400
-
- Added Brent's method
-
-commit 3d061e5db25b67b25f6ff87afe4162e121354232
-Author: Heng Li <***@live.co.uk>
-Date: Thu Apr 14 23:30:10 2011 -0400
-
- fixed a typo in printing
-
-commit 7fd14ceb5990bb350b8e97346ef3537d80058def
-Author: Heng Li <***@live.co.uk>
-Date: Thu Apr 14 23:14:23 2011 -0400
-
- fixed a stupid bug
-
-commit f5b2c3459ec098b3cafd9619b9077132516baf58
-Author: Heng Li <***@live.co.uk>
-Date: Thu Apr 14 22:42:35 2011 -0400
-
- separate EM and posterior
-
- Now, contrast is not performed unless -C is in use. EM can be invoked
- independently with -e without computing the posterior.
-
-commit 9eefcac963697fae554789b11ae3cb2c23f224d0
-Author: Heng Li <***@live.co.uk>
-Date: Thu Apr 14 22:00:19 2011 -0400
-
- further code cleanup; prepare to add EM interface
-
-commit c2cce52355262743711e4742b0c8542bfcab1cdd
-Author: Heng Li <***@live.co.uk>
-Date: Thu Apr 14 21:44:03 2011 -0400
-
- drop EM from prob1
-
-commit 24016f04bd3bdffb7eeb50cb25854f5007feb70f
-Author: Heng Li <***@live.co.uk>
-Date: Thu Apr 14 21:08:33 2011 -0400
-
- drop posterior LRT; prepare for clean up
-
-commit 3670d8bd88c3eb22873f0a80e2a5913f64ca8c9a
-Author: Heng Li <***@live.co.uk>
-Date: Thu Apr 14 20:57:43 2011 -0400
-
- better initial values for LD
-
-commit d48a8873c060b18b57799cfe3a0e5496ba069457
-Author: Heng Li <***@live.co.uk>
-Date: Thu Apr 14 20:36:25 2011 -0400
-
- finished EM
-
-commit b101f2db476188a950c23f5c1b6185fdb7f8f40b
-Author: Heng Li <***@live.co.uk>
-Date: Wed Apr 13 01:19:04 2011 -0400
-
- genotype frequency estimate
-
-commit d79bdcbf6242ecfb8accba9ac9a22fbcbd543cf2
-Author: Heng Li <***@live.co.uk>
-Date: Wed Apr 13 00:37:22 2011 -0400
-
- prepare for code clean up
-
-commit e0ce416abfc094f0c090957080b1404fd0edf752
-Author: Heng Li <***@live.co.uk>
-Date: Wed Apr 13 00:34:15 2011 -0400
-
- rename ld.c to em.c
-
-commit 45ede3ad181f35c1be24bed5d75841e472357ab7
-Author: Heng Li <***@live.co.uk>
-Date: Wed Apr 13 00:22:10 2011 -0400
-
- implemented EM likelihood ratio test
-
- The idea is learned from a brief chat with Rasmus Nielsen.
-
-commit 0454a346b60e42b75a2f742272089810279c7131
-Author: Heng Li <***@live.co.uk>
-Date: Tue Apr 12 15:45:52 2011 -0400
-
- added likelihood-ratio test (idea from Nick)
-
-commit f6287c8646c690440a1554c8958e7268f4134dc2
-Author: Heng Li <***@live.co.uk>
-Date: Sun Apr 10 18:24:37 2011 -0400
-
- Release samtools-0.1.15 (r949:203)
-
-commit de6023f38f4d652438557cf7a0ac6eec324e7416
-Author: Heng Li <***@live.co.uk>
-Date: Sun Apr 10 15:54:58 2011 -0400
-
- improved help information
-
-commit d3b337f2b7eda1e6f8f5575a19d1b5ed55cae279
-Author: Heng Li <***@live.co.uk>
-Date: Sat Apr 9 16:28:01 2011 -0400
-
- fixed a minor issue
-
-commit 82f6e4f49247e75fbd8ec08c285b8d3047b3d235
-Author: Heng Li <***@live.co.uk>
-Date: Sat Apr 9 15:49:04 2011 -0400
-
- separate QC-pass and QC-fail reads
-
-commit 8362b4a255081ee7ca0a4ca2eabc8c76758b6863
-Author: Heng Li <***@live.co.uk>
-Date: Fri Apr 8 17:45:19 2011 -0400
-
- added verbose level
-
-commit f7bf419c290462be7d289249a4a6d28f825b4c93
-Author: Heng Li <***@live.co.uk>
-Date: Fri Apr 8 16:08:14 2011 -0400
-
- fixed a bug
-
-commit 890cbb1ac93b3004fb6cf42ff47195077dcfc8ad
-Author: Heng Li <***@live.co.uk>
-Date: Fri Apr 8 16:00:37 2011 -0400
-
- drop unrelated @RG when "-R" is in use
-
-commit a62dc929c950fb51311b705f5b5bfba8e3f704d7
-Author: Heng Li <***@live.co.uk>
-Date: Fri Apr 8 16:00:14 2011 -0400
-
- skip header validation
-
-commit 39da810e2c56c8f0eff1ab726600b41f26d3d8e9
-Author: Heng Li <***@live.co.uk>
-Date: Tue Apr 5 23:52:22 2011 -0400
-
- change error message
-
-commit c0c50a34df250ef8a7a29b172058cd229be582b5
-Author: Heng Li <***@live.co.uk>
-Date: Tue Apr 5 23:50:46 2011 -0400
-
- fixed a bug caused by recent modifications
-
-commit 25226e8c468404cb5e1b5272efcea57e4193c762
-Author: Heng Li <***@live.co.uk>
-Date: Tue Apr 5 13:31:19 2011 -0400
-
- reduce the indel filtering window
-
-commit 5e18d7014437734f9dac9ab45a95e43ec2526101
-Author: Heng Li <***@live.co.uk>
-Date: Mon Apr 4 13:56:20 2011 -0400
-
- only output hwe if it is small enough
-
-commit 614941fb7dd276de662e7820eb8c7bae871a18cc
-Author: Heng Li <***@live.co.uk>
-Date: Mon Apr 4 13:34:02 2011 -0400
-
- added HWE back
-
-commit 7abe8825aa0bacccdeb38125934ae94d18f0ad4d
-Author: Heng Li <***@live.co.uk>
-Date: Mon Apr 4 12:46:24 2011 -0400
-
- EM estimate of genotype frequency
-
-commit 2bfeff9c645d177416664f1cb811e85cac3ff9e3
-Author: Heng Li <***@live.co.uk>
-Date: Mon Apr 4 11:29:12 2011 -0400
-
- minor
-
-commit 401e40647e7e3abbac6e4ec3d8bb68eb6f2d401b
-Author: Heng Li <***@live.co.uk>
-Date: Mon Apr 4 11:24:04 2011 -0400
-
- Added genotype freq estimate and association test
-
-commit 6cc226df6e3b480f1bd6e763ce8ef47f785bbb74
-Author: Heng Li <***@live.co.uk>
-Date: Sun Apr 3 20:57:23 2011 -0400
-
- minor changes
-
-commit 7e47a39630e812f09b80369f14606245976f687e
-Author: Heng Li <***@live.co.uk>
-Date: Fri Apr 1 15:21:59 2011 -0400
-
- print the grayscale
-
-commit 2f675d9c0dde3c166c99e335fa17c7873a5ae8d5
-Author: Heng Li <***@live.co.uk>
-Date: Fri Apr 1 08:55:16 2011 -0400
-
- change to comment
-
-commit 0592bb514994544ed84f51e509b233cf8821e0cf
-Author: Heng Li <***@live.co.uk>
-Date: Fri Apr 1 08:54:35 2011 -0400
-
- added base quality filtering
-
-commit fc1b47e04a7b94f6362c45856cbeb89d9d0b5ca5
-Author: Heng Li <***@live.co.uk>
-Date: Thu Mar 31 23:31:14 2011 -0400
-
- fixed a few typos in comments
-
-commit 60be79bc8f0d24656e5e8a329af7e9b5b91d4c8b
-Author: Heng Li <***@live.co.uk>
-Date: Thu Mar 31 23:13:23 2011 -0400
-
- comments
-
-commit 2432864acc25ebe5cee4217dbb0120439077a7f8
-Author: Heng Li <***@live.co.uk>
-Date: Thu Mar 31 22:42:46 2011 -0400
-
- added bam2depth.c, a demo program
-
-commit 39625f7c6bea9ccbfd9af0feb22348d52079f012
-Author: Heng Li <***@live.co.uk>
-Date: Thu Mar 31 16:37:22 2011 -0400
-
- added bgzf_check_bgzf() (used by tabix)
-
-commit 6de6bd3fb67fd22753a5f07d4cc25bf94e1b5a8c
-Author: Heng Li <***@live.co.uk>
-Date: Thu Mar 31 16:37:08 2011 -0400
-
- fixed a bug in bedidx.c
-
-commit 3b9e257d25b2e81eed1625bc5d2882ed486ef20e
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 30 13:27:15 2011 -0400
-
- added bed support to bcftools
-
-commit 47bcce3d14ec4d205283b61e5e653803996c42e0
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 30 12:56:40 2011 -0400
-
- Added BED support to "samtools view"
-
-commit a812386017faedfc86c0e6562adbb2138329cfeb
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 30 12:47:04 2011 -0400
-
- support BED file
-
-commit 3052dddc929f1825e6e7f7f6f6724d9465d6cf9a
-Author: Heng Li <***@live.co.uk>
-Date: Mon Mar 28 15:51:55 2011 -0400
-
- relax RG matching; proper mismatching message
-
-commit f86d60c8fe25785523f01fae1486d2a6df4ee6ef
-Author: Heng Li <***@live.co.uk>
-Date: Sat Mar 26 10:38:23 2011 -0400
-
- Avoid reporting association when something unexpected, which I do not understand, happens.
-
-commit dd41e6b26fd9fe30218748b9a0a1f49bdb1862b9
-Author: Heng Li <***@live.co.uk>
-Date: Sat Mar 26 10:38:01 2011 -0400
-
- Added -1 to merge
-
-commit 4a0364b0d7f87f1c88d71ec5857a1f1d40710681
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 23 16:56:55 2011 -0400
-
- plot pairwise r^2
-
-commit 452629a711582e612bec22b3b082e234bd37039b
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 23 14:31:01 2011 -0400
-
- pairwise LD; case-control AF2
-
-commit 52862951adcaecde26ba8f0d9c1897944640a674
-Author: Heng Li <***@live.co.uk>
-Date: Mon Mar 21 23:03:14 2011 -0400
-
- Release samtools-0.1.14 (r933:170)
-
-commit 59a5a8ba8e2940f0e38238f9339f02c91a8a0ce4
-Author: Heng Li <***@live.co.uk>
-Date: Mon Mar 21 13:52:55 2011 -0400
-
- optionally skip loci with too low sample coverage
-
-commit 6434264b5c69514d4fafe62cbd30b3bbaddc1d41
-Author: Heng Li <***@live.co.uk>
-Date: Sat Mar 19 14:38:25 2011 -0400
-
- mpileup support Illumina1.3+ quality; skip non-variant sites when "view -v" is in use
-
-commit 5f59e01987e1d5eca7d6359cae64a9734b18beea
-Author: Heng Li <***@live.co.uk>
-Date: Fri Mar 18 17:19:18 2011 -0400
-
- update version to r933:167
-
-commit 4d2c3c950910aa3d2c87760c3532e458fe01c0fa
-Author: Heng Li <***@live.co.uk>
-Date: Fri Mar 18 16:25:01 2011 -0400
-
- added "-1" to the command-line help
-
-commit 55313a015a7bd6369cf5a66fed7fab2333201dc9
-Author: Heng Li <***@live.co.uk>
-Date: Fri Mar 18 16:22:12 2011 -0400
-
- added the "cat" command (by Chris Saunders)
-
-commit b670272cadf3efa4dc456ac4c76104f73477d60d
-Author: Heng Li <***@live.co.uk>
-Date: Fri Mar 18 15:59:46 2011 -0400
-
- support varying the compression level
-
-commit c5dd3c9ca5f75f880e52c8cd2beae983bcb8d3b1
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 16 14:33:45 2011 -0400
-
- update the manual pages
-
-commit 12fb4b596dc51bccd154fc4bd0593442f7937a46
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 16 12:49:26 2011 -0400
-
- update changelog
-
-commit e7fe4fd66e02d60a1ca7952ad1938809e77729a9
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 16 12:10:05 2011 -0400
-
- do not call indels when the depth is very high
-
-commit 7455eeaa32b949bb3856f75810890aabf7cacb18
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 16 11:56:56 2011 -0400
-
- code clean up
-
-commit 5f16679e54ced8e67a75d949f9175c50480b914e
-Author: Heng Li <***@live.co.uk>
-Date: Tue Mar 15 14:45:24 2011 -0400
-
- when -s is specified, change the sample order
-
-commit 7ba95adee09d3b06a7eaf797d25efef837e592f5
-Author: Heng Li <***@live.co.uk>
-Date: Tue Mar 15 14:11:42 2011 -0400
-
- compute the rank in permutation
-
-commit d219783cea7643fc7e10e1bd3a98e9b3165b4506
-Author: Heng Li <***@live.co.uk>
-Date: Sun Mar 13 21:35:13 2011 -0400
-
- I have found a SERIOUS BUG!!!
-
-commit 8e20d04ecdac1a7788eef71c4bb91b8479cf7150
-Author: Heng Li <***@live.co.uk>
-Date: Sun Mar 13 17:04:04 2011 -0400
-
- optionally shuffle samples in a BCF (debugging)
-
-commit fc7b261f181f2a411427bc9ee5d586c883ca9cdc
-Author: Heng Li <***@live.co.uk>
-Date: Fri Mar 11 09:34:20 2011 -0500
-
- fixed a bug
-
-commit b3bbcc3d40994ae85705ab6fef9866ec8c142201
-Author: Heng Li <***@live.co.uk>
-Date: Thu Mar 10 20:25:59 2011 -0500
-
- use mode instead of mean
-
-commit f1161262d137098a19143b5cb0de810e5db3243e
-Author: Heng Li <***@live.co.uk>
-Date: Thu Mar 10 20:09:16 2011 -0500
-
- start from the mean instead of the mode
-
-commit 2ba56f5e99e90674855c4ffc8bf583340b932e1e
-Author: Heng Li <***@live.co.uk>
-Date: Thu Mar 10 17:13:34 2011 -0500
-
- fixed an error in Chi^2 test
-
-commit b4ce7ae400290bc43dd287240479667f99b3b11e
-Author: Heng Li <***@live.co.uk>
-Date: Thu Mar 10 00:23:39 2011 -0500
-
- minor
-
-commit 8487fa5d3a73a43443964e731ea2a4c873c9d4e5
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 9 21:33:19 2011 -0500
-
- added -F to accept BCFs generated by old samtools
-
-commit fd51d2093f7fd775a7eaaeea57fa34716ab59ac2
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 9 17:39:09 2011 -0500
-
- update version
-
-commit b6da54335df943015a998a934075331b467abb5b
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 9 17:37:14 2011 -0500
-
- compute pseudo-chi2 probability
-
-commit 9f73cefdb8935421d872b989dd98fbc8e1295029
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 9 15:54:04 2011 -0500
-
- remove a comment which is wrong
-
-commit b10b1e47ece522e97ab8ef23417bcb6454f8b9db
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 9 15:51:12 2011 -0500
-
- clean up
-
-commit 353bfae2c6ff59205bd9223db04084cf7f507f01
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 9 15:45:29 2011 -0500
-
- for backup
-
-commit 53915d1c6410c2537d18bfa8eb8c657a2233c35e
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 9 15:27:56 2011 -0500
-
- having debugging code
-
-commit 0d0dbf66995b1511390d593981eae7b5d36fe17b
-Author: Heng Li <***@live.co.uk>
-Date: Wed Mar 9 14:58:23 2011 -0500
-
- temporary backup
-
-commit 5b74a174a8b637dee43b7f30250df6fb96580e12
-Author: Heng Li <***@live.co.uk>
-Date: Tue Mar 8 15:46:11 2011 -0500
-
- the output makes sense, but there may be a typo...
-
-commit d81ec654b6c0c1eef6b0625d96f14b3155cee7c6
-Author: Heng Li <***@live.co.uk>
-Date: Tue Mar 8 15:19:09 2011 -0500
-
- added contrast2(); fixed a bug in haploid mode
-
-commit 0cfd896fad5f7737cca49efa94a11892dafcd812
-Author: Heng Li <***@live.co.uk>
-Date: Mon Mar 7 21:40:17 2011 -0500
-
- fixed a bug in haploid genotyping
-
-commit ccd52155ef61273f2b42ad9c7b31ff1915f81b24
-Author: Heng Li <***@live.co.uk>
-Date: Sat Mar 5 18:10:35 2011 -0500
-
- fixed a few bugs; still not fully working
-
-commit edc3af753f96f831968ae32f2e0f915b74f31e6e
-Author: Heng Li <***@live.co.uk>
-Date: Fri Mar 4 17:31:33 2011 -0500
-
- drop HWE calculation
-
-commit 92dac194debb66ca0718c21c871822dda2dd5bc1
-Author: Heng Li <***@live.co.uk>
-Date: Fri Mar 4 17:28:35 2011 -0500
-
- implemented hap/diploid mode; probably BUGGY!
-
-commit 7f26804bc27937e36fdc967e5c76514653ea40f5
-Author: Heng Li <***@live.co.uk>
-Date: Fri Mar 4 16:01:27 2011 -0500
-
- read ploidy
-
-commit e7b7213475b5e61a69aab77ffb02b4983c8e7678
-Author: Heng Li <***@live.co.uk>
-Date: Fri Mar 4 14:12:14 2011 -0500
-
- added math notes
-
-commit 46023e2f21321da83fc8e83e9229757a4e821acb
-Author: Heng Li <***@live.co.uk>
-Date: Fri Mar 4 13:34:10 2011 -0500
-
- update BCF spec
-
-commit 13190c49eeb006ad7013b7f1e9fc1b3beca3ae78
-Author: Heng Li <***@live.co.uk>
-Date: Tue Mar 1 14:45:19 2011 -0500
-
- Release samtools-0.1.13 (r926:134)
-
-commit be8fabbb6001d9fd5263a70a3e21ed6dfe5a9837
-Author: Heng Li <***@live.co.uk>
-Date: Tue Mar 1 14:07:15 2011 -0500
-
- prepare to finalize 0.1.13
-
-commit 1e8c753660978bed7e9289fe50becd596d9314bb
-Author: Heng Li <***@live.co.uk>
-Date: Tue Mar 1 09:40:17 2011 -0500
-
- allow to change whether to drop ambiguous reads
-
-commit 412210bfdb46606023f2e4b9086f2787f0cf1c62
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 28 22:01:29 2011 -0500
-
- revert to the old behavior of phase
-
-commit 46035589518cf84738de8666b866e2619457c1fb
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 28 16:46:23 2011 -0500
-
- change version number
-
-commit 7f40c33e37fc16fcb0a375ce46ae1d09cafb6d50
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 28 16:37:42 2011 -0500
-
- bugfix in indel calling: integer overflow
-
-commit 75849470efbe30042e5ddd516f9bcbe3b9bf6062
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 28 15:35:47 2011 -0500
-
- fixed a typo
-
-commit 9e6fb569885f906fabaab7fc2f02eae82f4bd602
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 28 15:34:09 2011 -0500
-
- minor changes to heuristic rules
-
-commit 30a799a91f5e2c10b761aa5437f902c6649fceb3
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 28 15:20:26 2011 -0500
-
- fixed a bug in the latest change
-
-commit e21ba9df950ea37f5c1b35c2af9ba9a4e0bba02a
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 28 12:47:06 2011 -0500
-
- put version in bam.h
-
-commit 918b14780c1dceb39c7010638ecd61c626e17166
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 28 12:00:38 2011 -0500
-
- frag_t::phased==0 reads are dumped to chimera.bam
-
-commit 657293c7bdba3ac69f53cd1ffa2874ed8756475e
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 28 11:05:29 2011 -0500
-
- change default -q to 37 (previously 40)
-
-commit 33d8d3bea76e466798ea322d68d34deb8d2dff06
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 28 10:39:57 2011 -0500
-
- fixed a minor bug in BAM reading
-
-commit daa25d426d42465d76c7317c95772bbb36bb3f47
-Author: Heng Li <***@live.co.uk>
-Date: Sat Feb 26 21:07:24 2011 -0500
-
- suppress gzopen64() warning
-
-commit 9cec4256eb9e7848d4711adb67b540659c141e32
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 25 22:14:52 2011 -0500
-
- fixed a long existing bug in vcf2fq
-
-commit 304487c83067a733add71cbc3886fa8c49f7ef2a
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 25 16:37:40 2011 -0500
-
- change version number
-
-commit 10ba6bf4f16692760f696f7b17f3719065786f77
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 25 16:34:08 2011 -0500
-
- Change the order of PL; change SP to int32_t
-
-commit c5cc2a8036a9c3579fbfde651efec4f6763b0228
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 25 14:52:03 2011 -0500
-
- claim X defined in the header
-
-commit 4ee8cb29f6092fd14a89f0cc5d3575112a204f39
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 25 14:40:24 2011 -0500
-
- minor changes
-
-commit 00065e9336a2831dc53bee7da2f4719845be1a2a
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 25 11:39:06 2011 -0500
-
- fixed an error in the BCF spec
-
-commit 1e2a73afcb72a02aa448718cb017c0438de89f90
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 25 11:36:40 2011 -0500
-
- update BCF spec
-
-commit dbf8eedaa38a405cb2fba5b3952b85776f51d035
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 25 11:28:43 2011 -0500
-
- update BCF spec
-
-commit eed1d91af9fad3c9d965333a55e623757f9c4e9d
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 25 09:51:39 2011 -0500
-
- fixed a flaw in targetcut
-
-commit 59bc980bb832b92a8b0cc244cf106e6150e4db6f
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 25 00:54:35 2011 -0500
-
- update manual page
-
-commit fcc4738c4abdca79e3de159e21208df1b98ac76c
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 25 00:45:39 2011 -0500
-
- update version format
-
-commit 5748639ae542b7f6b853562edc2bb3faf43030e4
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 25 00:45:12 2011 -0500
-
- update version number
-
-commit 06b44cc366cf27ce8976ee6a05810a0b3c48b56d
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 25 00:44:21 2011 -0500
-
- update version number
-
-commit ab7f4529d12739ff66fd4c09af9d992ab59c53ef
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 25 00:42:55 2011 -0500
-
- various help message
-
-commit a092e1f6f963272f8bb23616986ddaf604fd0f82
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 24 23:43:13 2011 -0500
-
- disable unfinished functionality
-
-commit f00a78db72b14ee4c6689fc13f20ed31aeaecd40
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 24 10:04:56 2011 -0500
-
- added "const" to bcf_p1_cal()
-
-commit 91049c4a8db3bf50dcc9d07506f22fa4ca5b5a96
-Author: Heng Li <***@live.co.uk>
-Date: Wed Feb 23 11:53:47 2011 -0500
-
- randomly allocate unphased reads
-
-commit f4405354a8d4cb3441141fa734573031059d7f57
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 22 15:36:07 2011 -0500
-
- fixed a typo
-
-commit 3075e4dc5c7c9d954426aabda6a73fa788357100
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 22 15:33:40 2011 -0500
-
- make output more informative
-
-commit 628cf3235e2815a40acf089fb1d3357be6437787
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 22 14:50:06 2011 -0500
-
- change the scoring rule; change default k to 13
-
-commit f22fd99831e4b5c74f898719216f359dbe987bbf
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 22 14:45:15 2011 -0500
-
- update scoring in masking
-
-commit 2f23547b81984555032aa0eefd064b8e07986fdc
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 22 14:37:17 2011 -0500
-
- remove dropreg()
-
-commit 4d8b6b1f1f331ca9041983c66e34a857c3b8f1bb
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 22 13:10:16 2011 -0500
-
- accept files from stdin
-
-commit 9b50c5038e6fc0185e29ca5b50fe0806a9a939b9
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 22 11:16:57 2011 -0500
-
- fixed a bug in consensus generation
-
-commit 1332ab32fb788fdc81b2ba8653b905d106238fad
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 21 22:53:23 2011 -0500
-
- print dropped fragments
-
-commit a288761b4ca1584e51076a71cbc4d72fe923dda1
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 21 22:37:04 2011 -0500
-
- bugfix: singletons are not phased
-
-commit 683365f534c0223dea7d72532015ac16a45ba22b
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 21 17:27:10 2011 -0500
-
- output singleton blocks
-
-commit 841a4609084d81f1bc81e0b00dd806002461e7d9
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 21 15:58:55 2011 -0500
-
- fixed a bug; not working with -l right now
-
-commit fdd57ea31732b5516dc212d72174b60206952636
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 21 15:17:00 2011 -0500
-
- skip mapQ==0 reads
-
-commit 4eb6ba75c23c1c9be5f76814fa1b93a2e304b2af
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 21 14:03:03 2011 -0500
-
- print the "targetcut" command
-
-commit 0123d9559ba58b026c0dfd15bc26019a193cd21a
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 21 11:22:13 2011 -0500
-
- allow to set the maximum depth
-
-commit 0f92eb248a4d06645b2c3d736a0faea8a7a9f731
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 21 09:56:41 2011 -0500
-
- use a proper error model to call hets
-
-commit 587a01504af5aea6288740d121dccf48fb8a75f4
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 21 09:16:38 2011 -0500
-
- phase is UNFINISHED; strip RG when merging
-
-commit 723bf3cd79e4f4a558373d4c707fa6b3db0fb357
-Author: Heng Li <***@live.co.uk>
-Date: Sat Feb 19 23:38:11 2011 -0500
-
- use a proper model to compute consensus
-
-commit 891a6b02d4a9af2ed98fbaac4915bf1f0da4f6c8
-Author: Heng Li <***@live.co.uk>
-Date: Sat Feb 19 22:14:19 2011 -0500
-
- added comment
-
-commit 8b55e0a581ecc9e4ba754d1f3c8784f3038b6e48
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 18 17:23:39 2011 -0500
-
- change the output format
-
-commit 75c36e8c563eddd0a362ba3b38cf0aea21aafb1f
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 15 20:31:00 2011 -0500
-
- fixed a bug in writing BAM
-
-commit bb0ce52f066cfebaa35a125d57b353bb717a5165
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 14 23:39:09 2011 -0500
-
- skip uncovered; unknown alleles taken as X
-
-commit ba67f4d119c7d06907db3015d337d9a01a3fc9fe
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 14 23:21:19 2011 -0500
-
- fixed a bug
-
-commit e4448d49e6129a5e1ee9c7f04f43612f12d6aad6
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 14 22:43:09 2011 -0500
-
- prepare to read hets from a list; unfinished
-
-commit 129ea29c1f12177c0a7c3e21676f6210370fc59b
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 14 16:32:22 2011 -0500
-
- updated khash.h to 0.2.5
-
-commit 15b44ed93bd949dffcf79ac8dbea6d9b7dfcb58c
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 14 16:15:04 2011 -0500
-
- use the latest version of khash
-
-commit 486c05f06f44d981dfb2069bcb43e4b35fd8389c
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 14 15:04:40 2011 -0500
-
- change the default -k to 11
-
-commit 07cf9d1e443d73cf053de38dd01671e3781f6e29
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 14 14:50:51 2011 -0500
-
- sort fragments by vpos instead of by beg
-
-commit d0d3e7faabf5cbb7e5ff7b294f7e220da807c4c0
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 14 14:45:41 2011 -0500
-
- shuffling the two haplotypes for better randomness
-
-commit 3be28eaf5f6033229aedf12ddb11a0084ba01cd8
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 14 14:09:17 2011 -0500
-
- write chimeras to a separate BAM
-
-commit 80ccbc26f43918fe42be123cc1da9d3d7ce30816
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 14 13:54:13 2011 -0500
-
- no mem leak/violation on small files; correctness is not checked
-
-commit 5c923867432fa14c26a19e3782e7f48d4080f6ac
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 14 13:50:25 2011 -0500
-
- bam separation; at least not immediate segfault
-
-commit cea2643ec30a59735bf89b2f562b563bf7263e79
-Author: Heng Li <***@live.co.uk>
-Date: Sun Feb 13 23:24:11 2011 -0500
-
- on the way to implement BAM separation; unfinished
-
-commit 964269cd15036a470ca89e43d0952201a0825671
-Author: Heng Li <***@live.co.uk>
-Date: Sun Feb 13 18:07:56 2011 -0500
-
- keep singletons in the hash table
-
-commit 2d4aa649bd670d5e038a1acaefd33c5fe24ae0e8
-Author: Heng Li <***@live.co.uk>
-Date: Sun Feb 13 17:42:24 2011 -0500
-
- Revert "prepare to add bam separation"
-
- This reverts commit ed6957e5211c2c4cf684dcb8bbb661052c74df6f.
-
-commit ed6957e5211c2c4cf684dcb8bbb661052c74df6f
-Author: Heng Li <***@live.co.uk>
-Date: Sun Feb 13 00:24:28 2011 -0500
-
- prepare to add bam separation
-
-commit d211e652d93791d2e112d334added243ffe5fc3e
-Author: Heng Li <***@live.co.uk>
-Date: Sat Feb 12 18:50:20 2011 -0500
-
- accelerate kstrtok
-
-commit 2d6af49d331ff5afe7b9e9b102e79d7d4512fdbe
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 11 21:08:21 2011 -0500
-
- split unlinked blocks
-
-commit 68e4cd1b560b0a6fd4c77e5e51eadde9fda26ea4
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 11 10:47:58 2011 -0500
-
- remove leading and trailing ambiguous positions
-
-commit d2b685141426a902ae76660c1fbe8020da150cf8
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 11 10:02:21 2011 -0500
-
- code clean up for further features
-
-commit c6980e062d55928b59f287c03e599dd5a37ed509
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 11 08:00:08 2011 -0500
-
- change /64 to >>6; the latter is faster
-
-commit 91635b9c2687f24d72ee6a8aad2050a79bb8400f
-Merge: 41d4df2 9a7e155
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 11 01:22:55 2011 -0500
-
- Merge branch 'master' into devel
-
-commit 9a7e155cc591c1b6c9f7f9cb939364a6becb65b2
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 11 01:21:07 2011 -0500
-
- output an unrecognized field as '.'; autofix GL/PL
-
-commit 41d4df2e9545e9abe97151cfe5d6c763f3d00db1
-Merge: c00c41c aacce0c
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 10 23:00:14 2011 -0500
-
- Merge branch 'master' into devel
-
-commit aacce0ce7276f451e4fddf81832f9e5f7f65198b
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 10 22:57:53 2011 -0500
-
- finished VCF->BCF conversion
-
-commit 0e875df643e41d848b709e2fa877de8ae53cdd4c
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 10 21:57:28 2011 -0500
-
- fixed a bug in reading VCF files
-
-commit c00c41c2a5da69cccea64adb542a0b365e56b4fc
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 10 16:28:37 2011 -0500
-
- suppress one-allele blocks
-
-commit 2e2354b673722e2f00d72970a043f80a66270da1
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 10 16:06:56 2011 -0500
-
- fixed the bug in filtering
-
-commit d971e1fe24de4ecaf94055efffc5f641e2bdb563
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 10 12:24:23 2011 -0500
-
- prepare to add filtering; buggy right now
-
-commit a0a5a3fbf504c3b02f7b9212e72315c1047cc249
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 10 11:55:02 2011 -0500
-
- make masking optional
-
-commit 28db71ccd95054a5f8a47c2332794f8968f6a822
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 10 11:40:47 2011 -0500
-
- routine to mask poorly called regions
-
-commit a3f6c439262bc10a4067860440f4d4dde9e0c515
-Author: Heng Li <***@live.co.uk>
-Date: Wed Feb 9 17:18:33 2011 -0500
-
- code clean up: remove globals
-
-commit 0b711978492f6ad39d459d78723c299468906818
-Author: Heng Li <***@live.co.uk>
-Date: Wed Feb 9 16:52:54 2011 -0500
-
- output more information
-
-commit f69d217ae5b691bf42ad07a97f29a7cc6456046f
-Author: Heng Li <***@live.co.uk>
-Date: Wed Feb 9 16:11:54 2011 -0500
-
- fixed another bug in flipping
-
-commit d47882d549337fbcc251597508a2c7faf1bb92e2
-Author: Heng Li <***@live.co.uk>
-Date: Wed Feb 9 16:01:35 2011 -0500
-
- fixed a stupid bug in flipping
-
-commit e33f89de499496537f5fbde396a66557f0353f1b
-Author: Heng Li <***@live.co.uk>
-Date: Wed Feb 9 15:54:42 2011 -0500
-
- fix chimeras; a little weird...
-
-commit 03d3c1d0b945245108ce0942d4772536a32212c7
-Author: Heng Li <***@live.co.uk>
-Date: Wed Feb 9 13:27:35 2011 -0500
-
- no effective change; prepare to fix chimera
-
-commit 6bc0a4676dd2252085a6e67bb06daa5ae05a554f
-Author: Heng Li <***@live.co.uk>
-Date: Wed Feb 9 11:52:58 2011 -0500
-
- better count output
-
-commit dcac515439d25f71125d6de8111da417776ab9ce
-Author: Heng Li <***@live.co.uk>
-Date: Wed Feb 9 10:31:07 2011 -0500
-
- prepare for another way of filtering
-
-commit ca7e4f1899b86d2e077994c789e8f69d699b3cd9
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 8 16:10:08 2011 -0500
-
- fixed the bug; I can do better.
-
-commit 0733f77b98af121bdcb198cea6151d159831bb9c
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 8 15:55:38 2011 -0500
-
- fixed two bugs; still not working...
-
-commit 80f18cba9ba73c9592380fc1ecd53c351d294782
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 8 15:42:58 2011 -0500
-
- filter false SNPs; NOT working right now
-
-commit 69a66e2f96d5b102cd712ff1527a3802fa84c590
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 8 14:39:09 2011 -0500
-
- write sequence in the SAM format for debugging
-
-commit b6f1c9d160822af2b713be206f37bd6dde00546a
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 7 11:51:21 2011 -0500
-
- fixed two bugs
-
-commit 400aa5c06100af9c47cd5e4ce8b95b7deb84f54b
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 7 11:22:38 2011 -0500
-
- Optionally apply BAQ
-
-commit 4c82e0e19682e424f5cdb8381364114c307b329e
-Author: Heng Li <***@live.co.uk>
-Date: Mon Feb 7 01:23:31 2011 -0500
-
- improved output; the result makes sense at a glance
-
-commit dc7853a581ab24bcc496e96b123ccf637e32ed1d
-Author: Heng Li <***@live.co.uk>
-Date: Sun Feb 6 14:12:43 2011 -0500
-
- process per linked block instead of per chr
-
-commit e867d9c6c2e61d9e748e78163e5481dca5697a36
-Author: Heng Li <***@live.co.uk>
-Date: Sun Feb 6 00:45:46 2011 -0500
-
- DP seems to work on toy examples
-
-commit 445ad72fc43d4354d56f5f759790e8ae0be73d02
-Author: Heng Li <***@live.co.uk>
-Date: Sat Feb 5 01:24:42 2011 -0500
-
- implemented backtrack; not tested
-
-commit ba38e180b9cd545956583b22e97e09b4bb12073e
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 4 23:55:23 2011 -0500
-
- More "correct" DP; backtrack not implemented
-
-commit d69761fd9351273ccd37ea431b10509add91e7cf
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 4 17:22:31 2011 -0500
-
- scratch of dynamic programming; unfinished...
-
-commit 769ffcb44e26e59300791658801d321559b33858
-Author: Heng Li <***@live.co.uk>
-Date: Fri Feb 4 16:29:55 2011 -0500
-
- UNFINISHED commit.
-
-commit 9adab9591317c3467f3d8cdf2d19ec1f65d1b5b7
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 3 16:20:59 2011 -0500
-
- another way of counting; can be even faster
-
-commit bbafbdc01ed1ceaab44927def1ad47c4c78aeb9c
-Author: Heng Li <***@live.co.uk>
-Date: Thu Feb 3 14:48:20 2011 -0500
-
- for backup
-
-commit eba7446389cad62a19133bced1386a4334dcab79
-Merge: a44a98e f01a593
-Author: Heng Li <***@live.co.uk>
-Date: Wed Feb 2 14:06:07 2011 -0500
-
- Merge branch 'master' into devel
-
-commit f01a5930445b5fda7e6b5b813ed63c652160ada2
-Author: Heng Li <***@live.co.uk>
-Date: Wed Feb 2 11:31:54 2011 -0500
-
- Better truncation warning when EOF is absent
-
-commit dd3ee5ed26c8bbef4a62fa5b2bfb0a75833f2c31
-Author: Heng Li <***@live.co.uk>
-Date: Wed Feb 2 10:38:28 2011 -0500
-
- fixed a typo in BCF/VCF headers
-
-commit b9d1137c55f401387113d1ad8a387489afe741db
-Author: Heng Li <***@live.co.uk>
-Date: Wed Feb 2 09:13:44 2011 -0500
-
- fixed an out-of-boundary bug (fixed by Roel Kluin)
-
-commit a44a98e16559b9672e8a3492c8f8c640074b7ee2
-Merge: ef68a14 d0443d5
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 1 21:54:48 2011 -0500
-
- Merge branch 'master' into devel
-
-commit d0443d5c2f648e0f69bd4c56eaac7868e501c18b
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 1 17:31:52 2011 -0500
-
- improved sorting order checking
-
-commit ef68a14fab91399b2ecd38345936c3d6e7391cf3
-Merge: 1e597b3 1a39a2e
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 1 15:12:37 2011 -0500
-
- Merge branch 'master' into devel
-
-commit 1a39a2eb08a270e20a34a0983e8bed6ffb3e2008
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 1 15:12:14 2011 -0500
-
- more precise error message
-
-commit e028e7a47c02232e06a9dd3009262c00dede1060
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 1 14:48:01 2011 -0500
-
- improved sorting order validation in index
-
-commit 1e597b3356744e2b791b12c9187f91c8054511d5
-Author: Heng Li <***@live.co.uk>
-Date: Tue Feb 1 14:44:27 2011 -0500
-
- testing only; not working
-
-commit 5753ace1e54228822d8ee95f69943f586e42f6e8
-Author: Heng Li <***@live.co.uk>
-Date: Mon Jan 31 17:37:08 2011 -0500
-
- reduce the effect of seq errors at the cost of SN
-
-commit 6f239ce5e0abd47babee33174476d48b723260d8
-Author: Heng Li <***@live.co.uk>
-Date: Mon Jan 31 17:29:34 2011 -0500
-
- added testing code
-
-commit 3db42fe22d27d61ab5735cd2308f73d93def8ebe
-Author: Heng Li <***@live.co.uk>
-Date: Mon Jan 31 14:33:21 2011 -0500
-
- routine for phasing fosmid resequencing (incomplete)
-
-commit ed88f2797323229ae8f38fbcd107b231007956a8
-Author: Heng Li <***@live.co.uk>
-Date: Mon Jan 31 10:12:53 2011 -0500
-
- SAM output
-
-commit abc6acae28dc4794f6422255f077cf370d34e414
-Merge: f1985a9 b133dbf
-Author: Heng Li <***@live.co.uk>
-Date: Sat Jan 29 22:56:10 2011 -0500
-
- Merge branch 'master' into devel
-
-commit b133dbf82de4e8cea5eb56e5bbf0c4b3e9368fd5
-Author: Heng Li <***@live.co.uk>
-Date: Sat Jan 29 22:37:11 2011 -0500
-
- fixed a bug in tview on big-endian by Nathan Weeks
-
-commit 9d3fdaef29f91e21dbfcb9ff0165b9573e7c1042
-Author: Heng Li <***@live.co.uk>
-Date: Sat Jan 29 22:24:00 2011 -0500
-
- update INSTALL
-
-commit 9d074a38bde53961f96157b6fb3683b6dded38d7
-Author: Heng Li <***@live.co.uk>
-Date: Sat Jan 29 21:56:25 2011 -0500
-
- avoid a segfault when network connect fails
-
-commit f1985a93f7455b3ea1b0ef9b959d50b896ccd620
-Author: Heng Li <***@live.co.uk>
-Date: Sat Jan 29 21:53:18 2011 -0500
-
- fixed a bug about bit ordering
-
-commit d09797db6fef648a6823cbe718d67664660c6ebe
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jan 27 16:53:19 2011 -0500
-
- point out there are 4 or fewer free parameters
-
-commit 5fd1717650ed68ab6c55d094d1648c16a054891a
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jan 27 16:09:18 2011 -0500
-
- updated .gitignore
-
-commit fccb19fbe8f9de91f59d85bb49a248683dc6266c
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jan 27 16:08:14 2011 -0500
-
- fixed a bug; better scoring
-
-commit b4dcb844bde3d09eedcd9f6832186ece60ae5afd
-Merge: ffc3e89 6f502de
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jan 27 14:50:30 2011 -0500
-
- Merge branch 'master' into devel
-
-commit 6f502dec46b18dae4bb5b2319715d028b5e193d0
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jan 27 14:47:31 2011 -0500
-
- skip unmapped and ref-skip reads in indel calling
-
-commit 3639f37dd8257b24560c35effcc3b6c16c3c1bcb
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jan 27 14:19:15 2011 -0500
-
- fixed an out-of-boundary bug in rare cases
-
-commit ffc3e89678ab9052b84f403da1e43044b045e73f
-Author: Heng Li <***@live.co.uk>
-Date: Thu Jan 27 14:00:17 2011 -0500
-
- targetcut can be compiled, though probably buggy
-
-commit f452b3ac51306865ddde31a8d715b155d4d3e6e6
-Author: Heng Li <***@live.co.uk>
-Date: Wed Jan 26 18:58:43 2011 -0500
-
- this is for a very special application...
-
-commit ca1451c6406c7ee757cb31349ea0b8de70db0656
-Author: Heng Li <***@live.co.uk>
-Date: Wed Jan 26 18:48:09 2011 -0500
-
- fixed compiling errors
-
-commit 085b87a7642865f17239fb6a436e626e25417838
-Author: Heng Li <***@live.co.uk>
-Date: Wed Jan 26 18:45:09 2011 -0500
-
- This script was put in a wrong place...
-
-commit 090d360828622520de60385af4928ce1aebe0e48
-Author: Heng Li <***@live.co.uk>
-Date: Wed Jan 26 18:33:58 2011 -0500
-
- Imported from samtools-r902
-------------------------------------------------------------------------
-r108 | lh3lh3 | 2009-01-20 11:56:45 +0000 (Tue, 20 Jan 2009) | 2 lines
-Changed paths:
- M /branches/dev/samtools/examples/Makefile
-
-made it a little more convenient
-
-------------------------------------------------------------------------
-r107 | lh3lh3 | 2009-01-20 11:53:30 +0000 (Tue, 20 Jan 2009) | 2 lines
-Changed paths:
- A /branches/dev/samtools/examples/Makefile
-
-added a Makefile
-
-------------------------------------------------------------------------
-r106 | lh3lh3 | 2009-01-20 11:25:05 +0000 (Tue, 20 Jan 2009) | 2 lines
-Changed paths:
- M /branches/dev/samtools/misc/maq2sam.c
-
-support RG tag
-
-------------------------------------------------------------------------
-r105 | lh3lh3 | 2009-01-18 17:37:20 +0000 (Sun, 18 Jan 2009) | 2 lines
-Changed paths:
- M /branches/dev/samtools/ChangeLog
-
-update changelog
-
-------------------------------------------------------------------------
-r104 | lh3lh3 | 2009-01-18 17:31:21 +0000 (Sun, 18 Jan 2009) | 3 lines
-Changed paths:
- M /branches/dev/samtools/bam.h
- M /branches/dev/samtools/bam_lpileup.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.1-18
- * fixed a bug in bam_lpileup.c: segment start and end are not correctly recognized
-
-------------------------------------------------------------------------
-r103 | lh3lh3 | 2009-01-18 16:34:03 +0000 (Sun, 18 Jan 2009) | 5 lines
-Changed paths:
- M /branches/dev/samtools/bam_import.c
- M /branches/dev/samtools/bam_index.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.1-17
- * fixed a bug when there are reads without coordinates
- * also recognize type 'c' as 'A'
- * found a bug in bam_lpileup.c; NOT fixed yet
-
-------------------------------------------------------------------------
-r102 | lh3lh3 | 2009-01-17 19:46:49 +0000 (Sat, 17 Jan 2009) | 2 lines
-Changed paths:
- A /branches/dev/samtools/INSTALL
-
-Instruction for compilation
-
-------------------------------------------------------------------------
-r101 | lh3lh3 | 2009-01-17 19:31:36 +0000 (Sat, 17 Jan 2009) | 3 lines
-Changed paths:
- M /branches/dev/samtools/Makefile
- A /branches/dev/samtools/Makefile.lite
- M /branches/dev/samtools/bam.h
- M /branches/dev/samtools/faidx.c
- M /branches/dev/samtools/misc/Makefile
- M /branches/dev/samtools/razf.c
-
- * replaced HAVE_RAZF with _NO_RAZF
- * added Makefile.lite for people who have trouble with razf.c
-
-------------------------------------------------------------------------
-r100 | lh3lh3 | 2009-01-16 10:03:37 +0000 (Fri, 16 Jan 2009) | 3 lines
-Changed paths:
- M /branches/dev/samtools/bam_mate.c
- M /branches/dev/samtools/bamtk.c
- M /branches/dev/samtools/misc/wgsim.c
-
- * samtools-0.1.1-15
- * fixed another bug in fixmate: unmapped pair has non-zero isize
-
-------------------------------------------------------------------------
-r99 | lh3lh3 | 2009-01-16 09:13:36 +0000 (Fri, 16 Jan 2009) | 4 lines
-Changed paths:
- M /branches/dev/samtools/ChangeLog
- M /branches/dev/samtools/bam_mate.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.1-14
- * fixed a bug in fixmate: isize not equal to zero if two ends mapped to
- different chr
-
-------------------------------------------------------------------------
-r98 | lh3lh3 | 2009-01-15 16:47:41 +0000 (Thu, 15 Jan 2009) | 3 lines
-Changed paths:
- M /branches/dev/samtools/bam_maqcns.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.1-13
- * fixed the prior for hom indels (Richard pointed this out)
-
-------------------------------------------------------------------------
-r97 | lh3lh3 | 2009-01-15 16:38:47 +0000 (Thu, 15 Jan 2009) | 4 lines
-Changed paths:
- M /branches/dev/samtools/COPYING
- M /branches/dev/samtools/bam_sort.c
- M /branches/dev/samtools/bamtk.c
- M /branches/dev/samtools/source.dot
-
- * samtools-0.1.1-12
- * fixed a bug in sort
- * update source file graph and copyright information
-
-------------------------------------------------------------------------
-r96 | lh3lh3 | 2009-01-14 21:46:14 +0000 (Wed, 14 Jan 2009) | 2 lines
-Changed paths:
- M /branches/dev/samtools/glf.c
-
-fixed a typo
-
-------------------------------------------------------------------------
-r95 | lh3lh3 | 2009-01-14 21:44:53 +0000 (Wed, 14 Jan 2009) | 2 lines
-Changed paths:
- M /branches/dev/samtools/glf.c
-
-added a main function for glf.c
-
-------------------------------------------------------------------------
-r94 | lh3lh3 | 2009-01-14 17:14:59 +0000 (Wed, 14 Jan 2009) | 4 lines
-Changed paths:
- M /branches/dev/samtools/Makefile
- M /branches/dev/samtools/bam_plcmd.c
- M /branches/dev/samtools/bamtk.c
- M /branches/dev/samtools/bgzf.h
- A /branches/dev/samtools/glf.c
- M /branches/dev/samtools/glf.h
-
- * samtools-0.1.1-11
- * generate binary GLFv2
- * added glfview command to dump GLFv2 binary file
-
-------------------------------------------------------------------------
-r93 | lh3lh3 | 2009-01-14 15:07:44 +0000 (Wed, 14 Jan 2009) | 4 lines
-Changed paths:
- M /branches/dev/samtools/bam_rmdup.c
- M /branches/dev/samtools/bamtk.c
- M /branches/dev/samtools/glf.h
-
- * samtools-0.1.1-10
- * fixed several bugs in rmdup
- * prepare to generate GLF2
-
-------------------------------------------------------------------------
-r92 | lh3lh3 | 2009-01-14 13:27:44 +0000 (Wed, 14 Jan 2009) | 3 lines
-Changed paths:
- M /branches/dev/samtools/Makefile
- M /branches/dev/samtools/bam.h
- M /branches/dev/samtools/bam_import.c
- A /branches/dev/samtools/bam_rmdup.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.1-9
- * implemented rmdup; NOT tested yet
-
-------------------------------------------------------------------------
-r91 | lh3lh3 | 2009-01-13 20:15:43 +0000 (Tue, 13 Jan 2009) | 2 lines
-Changed paths:
- M /branches/dev/samtools/examples/00README.txt
-
-update README for typos
-
-------------------------------------------------------------------------
-r90 | lh3lh3 | 2009-01-13 19:57:50 +0000 (Tue, 13 Jan 2009) | 2 lines
-Changed paths:
- M /branches/dev/samtools/examples/ex1.sam.gz
-
-update example
-
-------------------------------------------------------------------------
-r89 | lh3lh3 | 2009-01-13 17:21:38 +0000 (Tue, 13 Jan 2009) | 3 lines
-Changed paths:
- M /branches/dev/samtools/Makefile
- M /branches/dev/samtools/bam.c
- A /branches/dev/samtools/bam_mate.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.1-8
- * added fixmate command
-
-------------------------------------------------------------------------
-r88 | lh3lh3 | 2009-01-13 10:48:23 +0000 (Tue, 13 Jan 2009) | 3 lines
-Changed paths:
- M /branches/dev/samtools/bam_plcmd.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.1-7
- * change the reported indel position to the previous way
-
-------------------------------------------------------------------------
-r87 | lh3lh3 | 2009-01-12 22:12:12 +0000 (Mon, 12 Jan 2009) | 4 lines
-Changed paths:
- M /branches/dev/samtools/bam_plcmd.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.1-6
- * added glt output
- * allow to change indel calling parameters at the command line
-
-------------------------------------------------------------------------
-r86 | lh3lh3 | 2009-01-12 21:16:48 +0000 (Mon, 12 Jan 2009) | 4 lines
-Changed paths:
- M /branches/dev/samtools/bam.h
- M /branches/dev/samtools/bam_pileup.c
- M /branches/dev/samtools/bam_plcmd.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.1-5
- * added two more flags
- * allowed to select reads shown in pileup with a mask
-
-------------------------------------------------------------------------
-r85 | lh3lh3 | 2009-01-12 20:47:51 +0000 (Mon, 12 Jan 2009) | 4 lines
-Changed paths:
- M /branches/dev/samtools/bam_index.c
- M /branches/dev/samtools/bam_plcmd.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.1-4
- * fixed a bug in indexing (linear index)
- * prepare to add glt output from pileup
-
-------------------------------------------------------------------------
-r84 | lh3lh3 | 2009-01-12 09:22:35 +0000 (Mon, 12 Jan 2009) | 3 lines
-Changed paths:
- M /branches/dev/samtools/bam_plcmd.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.1-3
- * fixed a bug in outputting the coordinate of an indel
-
-------------------------------------------------------------------------
-r83 | lh3lh3 | 2009-01-11 15:18:01 +0000 (Sun, 11 Jan 2009) | 3 lines
-Changed paths:
- M /branches/dev/samtools/bam_plcmd.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.1-2
- * pileup: allows to output indel sites only
-
-------------------------------------------------------------------------
-r82 | lh3lh3 | 2009-01-10 23:34:31 +0000 (Sat, 10 Jan 2009) | 3 lines
-Changed paths:
- M /branches/dev/samtools/bam_maqcns.c
- M /branches/dev/samtools/bam_maqcns.h
- M /branches/dev/samtools/bam_plcmd.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.1-1
- * implemented a Bayesian indel caller
-
-------------------------------------------------------------------------
-r81 | lh3lh3 | 2009-01-09 09:54:28 +0000 (Fri, 09 Jan 2009) | 2 lines
-Changed paths:
- M /branches/dev/samtools/examples/00README.txt
- D /branches/dev/samtools/examples/ex1.fa.fai
-
-Let users generate ex1.fa.fai.
-
-------------------------------------------------------------------------
-r80 | lh3lh3 | 2009-01-08 16:10:08 +0000 (Thu, 08 Jan 2009) | 2 lines
-Changed paths:
- M /branches/dev/samtools/misc/bowtie2sam.pl
-
-make the bowtie converter work for "-k 2"
-
-------------------------------------------------------------------------
-r78 | lh3lh3 | 2009-01-03 17:25:24 +0000 (Sat, 03 Jan 2009) | 2 lines
-Changed paths:
- M /branches/dev/samtools/misc/export2sam.pl
-
-fixed a bug for "QC" reads
-
-------------------------------------------------------------------------
-r77 | lh3lh3 | 2009-01-01 18:32:06 +0000 (Thu, 01 Jan 2009) | 3 lines
-Changed paths:
- A /branches/dev/samtools/misc/bowtie2sam.pl
- M /branches/dev/samtools/misc/soap2sam.pl
-
- * soap2sam.pl: added NM tag
- * bowtie2sam.pl: converter for bowtie
-
-------------------------------------------------------------------------
-r76 | lh3lh3 | 2008-12-31 23:24:24 +0000 (Wed, 31 Dec 2008) | 2 lines
-Changed paths:
- A /branches/dev/samtools/misc/soap2sam.pl
-
-soap2sam.pl: convert soap output to SAM
-
-------------------------------------------------------------------------
-r75 | lh3lh3 | 2008-12-31 17:54:32 +0000 (Wed, 31 Dec 2008) | 3 lines
-Changed paths:
- M /branches/dev/samtools/misc/wgsim_eval.pl
-
- * wgsim_eval.pl-0.1.1
- * fixed a bug for a contig name like "NT_012345"
-
-------------------------------------------------------------------------
-r74 | lh3lh3 | 2008-12-31 16:38:21 +0000 (Wed, 31 Dec 2008) | 2 lines
-Changed paths:
- A /branches/dev/samtools/misc/wgsim_eval.pl
-
- * evaluate alignment for reads generated by wgsim
-
-------------------------------------------------------------------------
-r73 | lh3lh3 | 2008-12-31 15:11:22 +0000 (Wed, 31 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/misc/Makefile
- M /branches/dev/samtools/misc/wgsim.c
-
-fixed compiling warnings for wgsim
-
-------------------------------------------------------------------------
-r72 | lh3lh3 | 2008-12-31 13:40:51 +0000 (Wed, 31 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/bam_tview.c
-
-remove an unused variable (a compiler warning only)
-
-------------------------------------------------------------------------
-r71 | lh3lh3 | 2008-12-31 13:37:16 +0000 (Wed, 31 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/misc/Makefile
- A /branches/dev/samtools/misc/wgsim.c
-
-wgsim: Paired-end reads simulator
-
-------------------------------------------------------------------------
-r70 | bhandsaker | 2008-12-29 20:27:16 +0000 (Mon, 29 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/bam_maqcns.c
- M /branches/dev/samtools/bam_tview.c
-
-Move definition of bam_nt16_nt4_table so we can build without curses.
-
-------------------------------------------------------------------------
-r62 | lh3lh3 | 2008-12-22 15:55:13 +0000 (Mon, 22 Dec 2008) | 2 lines
-Changed paths:
- A /branches/dev/samtools/NEWS
- M /branches/dev/samtools/bamtk.c
- M /branches/dev/samtools/samtools.1
-
-Release samtools-0.1.1
-
-------------------------------------------------------------------------
-r61 | lh3lh3 | 2008-12-22 15:46:08 +0000 (Mon, 22 Dec 2008) | 10 lines
-Changed paths:
- M /branches/dev/samtools/bam_aux.c
- M /branches/dev/samtools/bam_index.c
- M /branches/dev/samtools/bam_plcmd.c
- M /branches/dev/samtools/bam_tview.c
- M /branches/dev/samtools/bamtk.c
- M /branches/dev/samtools/razf.c
- M /branches/dev/samtools/samtools.1
-
- * samtools-0.1.0-66
- * fixed a bug in razf.c: reset z_eof when razf_seek() is called
- * fixed a memory leak in parsing a region
- * changed pileup a little bit when -s is in use: output ^ and $
- * when a bam is not indexed, output more meaningful error message
- * fixed a bug in indexing for small alignment
- * fixed a bug in the viewer when we come to the end of a reference file
- * updated documentation
- * prepare to release 0.1.1
-
-------------------------------------------------------------------------
-r60 | lh3lh3 | 2008-12-22 15:10:16 +0000 (Mon, 22 Dec 2008) | 2 lines
-Changed paths:
- A /branches/dev/samtools/examples
- A /branches/dev/samtools/examples/00README.txt
- A /branches/dev/samtools/examples/ex1.fa
- A /branches/dev/samtools/examples/ex1.fa.fai
- A /branches/dev/samtools/examples/ex1.sam.gz
-
-example
-
-------------------------------------------------------------------------
-r59 | lh3lh3 | 2008-12-22 09:38:15 +0000 (Mon, 22 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/ChangeLog
-
-update ChangeLog
-
-------------------------------------------------------------------------
-r58 | lh3lh3 | 2008-12-20 23:06:00 +0000 (Sat, 20 Dec 2008) | 3 lines
-Changed paths:
- M /branches/dev/samtools/misc/export2sam.pl
-
- * added comments
- * fixed several bugs
-
-------------------------------------------------------------------------
-r57 | lh3lh3 | 2008-12-20 15:44:20 +0000 (Sat, 20 Dec 2008) | 2 lines
-Changed paths:
- A /branches/dev/samtools/misc/export2sam.pl
-
-convert Export format to SAM; not thoroughly tested
-
-------------------------------------------------------------------------
-r56 | lh3lh3 | 2008-12-19 22:13:28 +0000 (Fri, 19 Dec 2008) | 6 lines
-Changed paths:
- M /branches/dev/samtools/bam_import.c
- M /branches/dev/samtools/bam_plcmd.c
- M /branches/dev/samtools/bam_tview.c
- M /branches/dev/samtools/bamtk.c
- A /branches/dev/samtools/source.dot
-
- * samtools-0.1.0-65
- * pileup: generate maq-like simple output
- * pileup: allow to output pileup at required sites
- * source.dot: source file relationship graph
- * tview: fixed a minor bug
-
-------------------------------------------------------------------------
-r55 | lh3lh3 | 2008-12-19 20:10:26 +0000 (Fri, 19 Dec 2008) | 2 lines
-Changed paths:
- D /branches/dev/samtools/misc/all2sam.pl
-
-remove all2sam.pl
-
-------------------------------------------------------------------------
-r54 | lh3lh3 | 2008-12-16 22:34:25 +0000 (Tue, 16 Dec 2008) | 2 lines
-Changed paths:
- A /branches/dev/samtools/COPYING
- M /branches/dev/samtools/bam.h
- M /branches/dev/samtools/faidx.h
- M /branches/dev/samtools/khash.h
- M /branches/dev/samtools/kseq.h
- M /branches/dev/samtools/ksort.h
- M /branches/dev/samtools/samtools.1
-
-Added copyright information and a bit more documentation. No code change.
-
-------------------------------------------------------------------------
-r53 | lh3lh3 | 2008-12-16 13:40:18 +0000 (Tue, 16 Dec 2008) | 3 lines
-Changed paths:
- M /branches/dev/samtools/bam.c
- M /branches/dev/samtools/bam.h
- M /branches/dev/samtools/bam_index.c
- M /branches/dev/samtools/bam_maqcns.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.0-64
- * improved efficiency of the indel caller for spliced alignments
-
-------------------------------------------------------------------------
-r52 | lh3lh3 | 2008-12-16 10:28:20 +0000 (Tue, 16 Dec 2008) | 3 lines
-Changed paths:
- M /branches/dev/samtools/bam.c
- M /branches/dev/samtools/bam.h
- M /branches/dev/samtools/bam_aux.c
- M /branches/dev/samtools/bam_index.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.0-63
- * a bit code cleanup: reduce the dependency between source files
-
-------------------------------------------------------------------------
-r51 | lh3lh3 | 2008-12-15 14:29:32 +0000 (Mon, 15 Dec 2008) | 3 lines
-Changed paths:
- M /branches/dev/samtools/bam_maqcns.c
- M /branches/dev/samtools/bam_plcmd.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.0-62
- * fixed a memory leak
-
-------------------------------------------------------------------------
-r50 | lh3lh3 | 2008-12-15 14:00:13 +0000 (Mon, 15 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/ChangeLog
- M /branches/dev/samtools/bam.h
- M /branches/dev/samtools/samtools.1
-
-update documentation, ChangeLog and a comment
-
-------------------------------------------------------------------------
-r49 | lh3lh3 | 2008-12-15 13:36:43 +0000 (Mon, 15 Dec 2008) | 6 lines
-Changed paths:
- M /branches/dev/samtools/Makefile
- M /branches/dev/samtools/bam.h
- M /branches/dev/samtools/bam_maqcns.c
- M /branches/dev/samtools/bam_maqcns.h
- M /branches/dev/samtools/bam_pileup.c
- A /branches/dev/samtools/bam_plcmd.c
- M /branches/dev/samtools/bamtk.c
- M /branches/dev/samtools/samtools.1
-
- * samtools-0.1.0-61
- * moved pileup command to a separate source file
- * added indel caller
- * added bam_cal_segend(). (NOT WORKING for spliced alignment!!!)
- * updated documentation
-
-------------------------------------------------------------------------
-r48 | lh3lh3 | 2008-12-12 13:55:36 +0000 (Fri, 12 Dec 2008) | 3 lines
-Changed paths:
- M /branches/dev/samtools/bam_maqcns.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.0-60
- * fixed another bug in maqcns when there is a nearby deletion
-
-------------------------------------------------------------------------
-r47 | lh3lh3 | 2008-12-12 13:42:16 +0000 (Fri, 12 Dec 2008) | 5 lines
-Changed paths:
- M /branches/dev/samtools/bam_maqcns.c
- M /branches/dev/samtools/bam_pileup.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.0-59
- * pileup: outputting consensus is now optional
- * fixed a bug in glfgen. This bug also exists in maq's glfgen. However,
- I am not quite sure why the previous version may have problem.
-
-------------------------------------------------------------------------
-r46 | lh3lh3 | 2008-12-12 11:44:56 +0000 (Fri, 12 Dec 2008) | 6 lines
-Changed paths:
- M /branches/dev/samtools/bam_pileup.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.0-58
- * add maq consensus to pileup. However, I will move this part to a new
- command as strictly speaking, consensus calling is not part of pileup,
- and imposing it would make it harder to generate for other language
- bindings.
-
-------------------------------------------------------------------------
-r45 | bhandsaker | 2008-12-11 20:43:56 +0000 (Thu, 11 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/bgzf.c
-
-Fix bug in tell() after reads that consume to the exact end of a block.
-
-------------------------------------------------------------------------
-r44 | lh3lh3 | 2008-12-11 09:36:53 +0000 (Thu, 11 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/samtools.1
-
-update manual
-
-------------------------------------------------------------------------
-r43 | lh3lh3 | 2008-12-11 09:25:36 +0000 (Thu, 11 Dec 2008) | 4 lines
-Changed paths:
- M /branches/dev/samtools/bam_import.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.0-57
- * fixed a bug in parser when there are auxiliary fields
- * made the parser a bit more robust
-
-------------------------------------------------------------------------
-r42 | lh3lh3 | 2008-12-10 14:57:29 +0000 (Wed, 10 Dec 2008) | 5 lines
-Changed paths:
- M /branches/dev/samtools/bam_index.c
- M /branches/dev/samtools/bamtk.c
- M /branches/dev/samtools/bgzf.c
-
- * samtools-0.1.0-56
- * fixed a bug in bgzf (only reading is affected)
- * fixed a typo in bam_index.c
- * in bam_index.c, check potential bugs in the underlying I/O library
-
-------------------------------------------------------------------------
-r41 | lh3lh3 | 2008-12-10 12:53:08 +0000 (Wed, 10 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/samtools.1
-
-update manual
-
-------------------------------------------------------------------------
-r40 | lh3lh3 | 2008-12-10 11:52:10 +0000 (Wed, 10 Dec 2008) | 5 lines
-Changed paths:
- M /branches/dev/samtools/bam.h
- M /branches/dev/samtools/bam_pileup.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.0-55
- * tried to make pileup work with clipping (previously not), though NOT tested
- * removed -v from pileup
- * made pileup take the reference sequence
-
-------------------------------------------------------------------------
-r39 | lh3lh3 | 2008-12-09 11:59:28 +0000 (Tue, 09 Dec 2008) | 4 lines
-Changed paths:
- M /branches/dev/samtools/bam_import.c
- M /branches/dev/samtools/bamtk.c
- M /branches/dev/samtools/samtools.1
-
- * samtools-0.1.0-54
- * in parser, recognize "=", rather than ",", as a match
- * in parser, correctly parse "=" at the MRNM field.
-
-------------------------------------------------------------------------
-r38 | lh3lh3 | 2008-12-09 11:39:07 +0000 (Tue, 09 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/misc/maq2sam.c
-
-fixed a bug in handling maq flag 64 and 192
-
-------------------------------------------------------------------------
-r37 | lh3lh3 | 2008-12-09 09:53:46 +0000 (Tue, 09 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/misc/md5fa.c
-
-also calculate unordered md5sum check
-
-------------------------------------------------------------------------
-r36 | lh3lh3 | 2008-12-09 09:46:21 +0000 (Tue, 09 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/misc/md5fa.c
-
-fixed a minor bug when there are spaces in the sequence
-
-------------------------------------------------------------------------
-r35 | lh3lh3 | 2008-12-09 09:40:45 +0000 (Tue, 09 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/misc/md5fa.c
-
-fixed a potential memory leak
-
-------------------------------------------------------------------------
-r34 | lh3lh3 | 2008-12-08 14:52:17 +0000 (Mon, 08 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/bam_import.c
- M /branches/dev/samtools/bam_index.c
- M /branches/dev/samtools/bamtk.c
-
- * fixed a bug in import: bin is wrongly calculated
-
-------------------------------------------------------------------------
-r33 | lh3lh3 | 2008-12-08 14:08:01 +0000 (Mon, 08 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/misc/all2sam.pl
-
-nothing, really
-
-------------------------------------------------------------------------
-r32 | lh3lh3 | 2008-12-08 12:56:02 +0000 (Mon, 08 Dec 2008) | 3 lines
-Changed paths:
- M /branches/dev/samtools/Makefile
- M /branches/dev/samtools/kseq.h
- M /branches/dev/samtools/misc/Makefile
- A /branches/dev/samtools/misc/md5.c
- A /branches/dev/samtools/misc/md5.h
- A /branches/dev/samtools/misc/md5fa.c
-
- * fixed two warnings in kseq.h
- * added md5sum utilities
-
-------------------------------------------------------------------------
-r31 | lh3lh3 | 2008-12-08 11:35:29 +0000 (Mon, 08 Dec 2008) | 5 lines
-Changed paths:
- M /branches/dev/samtools/Makefile
- M /branches/dev/samtools/bam_import.c
- M /branches/dev/samtools/bamtk.c
- A /branches/dev/samtools/kseq.h
- D /branches/dev/samtools/kstream.h
-
- * samtools-0.1.0-52
- * replace kstream with kseq. kseq is a superset of kstream. I need the
- extra functions in kseq.h.
- * also compile stand-alone faidx
-
-------------------------------------------------------------------------
-r30 | lh3lh3 | 2008-12-08 11:17:04 +0000 (Mon, 08 Dec 2008) | 3 lines
-Changed paths:
- M /branches/dev/samtools/bam.h
- M /branches/dev/samtools/bam_sort.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.0-51
- * sorting by read names is available
-
-------------------------------------------------------------------------
-r29 | lh3lh3 | 2008-12-08 10:29:02 +0000 (Mon, 08 Dec 2008) | 3 lines
-Changed paths:
- M /branches/dev/samtools/bam.c
- M /branches/dev/samtools/bam.h
- M /branches/dev/samtools/bam_import.c
- M /branches/dev/samtools/bam_maqcns.c
- M /branches/dev/samtools/bam_pileup.c
- M /branches/dev/samtools/bam_sort.c
- M /branches/dev/samtools/bam_tview.c
- M /branches/dev/samtools/bamtk.c
- M /branches/dev/samtools/misc/maq2sam.c
-
- * samtools-0.1.0-50
- * format change to meet the latest specification
-
-------------------------------------------------------------------------
-r28 | lh3lh3 | 2008-12-04 16:09:21 +0000 (Thu, 04 Dec 2008) | 3 lines
-Changed paths:
- M /branches/dev/samtools/bam_maqcns.c
- M /branches/dev/samtools/misc/maq2sam.c
-
- * minor change in maqcns: special care when n==0
- * change maq2sam to meet the latest specification
-
-------------------------------------------------------------------------
-r27 | lh3lh3 | 2008-12-04 15:55:44 +0000 (Thu, 04 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/razf.c
- M /branches/dev/samtools/razf.h
-
-considerable code clean up in razf
-
-------------------------------------------------------------------------
-r26 | lh3lh3 | 2008-12-04 15:08:18 +0000 (Thu, 04 Dec 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/ChangeLog
- M /branches/dev/samtools/Makefile
- M /branches/dev/samtools/faidx.c
-
-make RAZF optional in faidx.c
-
-------------------------------------------------------------------------
-r25 | lh3lh3 | 2008-12-01 15:27:22 +0000 (Mon, 01 Dec 2008) | 3 lines
-Changed paths:
- M /branches/dev/samtools/Makefile
- M /branches/dev/samtools/bam.h
- M /branches/dev/samtools/bam_aux.c
- M /branches/dev/samtools/bamtk.c
- M /branches/dev/samtools/samtools.1
-
- * samtools-0.1.0-49
- * added routines for retrieving aux data, NOT TESTED YET!
-
-------------------------------------------------------------------------
-r24 | lh3lh3 | 2008-12-01 14:29:43 +0000 (Mon, 01 Dec 2008) | 5 lines
-Changed paths:
- M /branches/dev/samtools/bam.c
- M /branches/dev/samtools/bam_import.c
- M /branches/dev/samtools/bam_maqcns.c
- M /branches/dev/samtools/bamtk.c
- M /branches/dev/samtools/bgzf.c
- M /branches/dev/samtools/samtools.1
-
- * samtools-0.1.0-48
- * bgzf: fixed a potential integer overflow on 32-bit machines
- * maqcns: set the minimum combined quality as 0
- * supporting hex strings
-
-------------------------------------------------------------------------
-r23 | lh3lh3 | 2008-11-27 17:14:37 +0000 (Thu, 27 Nov 2008) | 3 lines
-Changed paths:
- M /branches/dev/samtools/bam_maqcns.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.0-47
- * fixed the bug in maqcns
-
-------------------------------------------------------------------------
-r22 | lh3lh3 | 2008-11-27 17:08:11 +0000 (Thu, 27 Nov 2008) | 3 lines
-Changed paths:
- M /branches/dev/samtools/Makefile
- M /branches/dev/samtools/bam.h
- A /branches/dev/samtools/bam_maqcns.c
- A /branches/dev/samtools/bam_maqcns.h
- M /branches/dev/samtools/bam_tview.c
- M /branches/dev/samtools/bamtk.c
- A /branches/dev/samtools/glf.h
-
- * samtools-0.1.0-46
- * add MAQ consensus caller, currently BUGGY!
-
-------------------------------------------------------------------------
-r21 | lh3lh3 | 2008-11-27 13:51:28 +0000 (Thu, 27 Nov 2008) | 4 lines
-Changed paths:
- M /branches/dev/samtools/bam_pileup.c
- M /branches/dev/samtools/bam_tview.c
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.0-45
- * tview: display padded alignment (but not P operation)
- * better coordinates and reference sequence
-
-------------------------------------------------------------------------
-r19 | lh3lh3 | 2008-11-27 09:26:05 +0000 (Thu, 27 Nov 2008) | 2 lines
-Changed paths:
- A /branches/dev/samtools/ChangeLog
-
-new ChangeLog
-
-------------------------------------------------------------------------
-r18 | lh3lh3 | 2008-11-27 09:24:45 +0000 (Thu, 27 Nov 2008) | 3 lines
-Changed paths:
- D /branches/dev/samtools/ChangeLog
- A /branches/dev/samtools/ChangeLog.old (from /branches/dev/samtools/ChangeLog:6)
-
-Rename ChangeLog to ChangeLog.old. This old ChangeLog is generated from
-the log of my personal SVN repository.
-
-------------------------------------------------------------------------
-r17 | lh3lh3 | 2008-11-27 09:22:55 +0000 (Thu, 27 Nov 2008) | 6 lines
-Changed paths:
- M /branches/dev/samtools/Makefile
- M /branches/dev/samtools/bamtk.c
- M /branches/dev/samtools/bgzf.c
-
- * samtools-0.1.0-44
- * declare fseeko and ftello as some Linux may not do this by default and
- missing these declarations will make bgzf buggy
- * get rid of some harmless warnings
- * use BGZF by default, now
-
-------------------------------------------------------------------------
-r16 | lh3lh3 | 2008-11-26 21:19:11 +0000 (Wed, 26 Nov 2008) | 4 lines
-Changed paths:
- M /branches/dev/samtools/bam_index.c
- M /branches/dev/samtools/bamtk.c
- M /branches/dev/samtools/razf.c
-
- * samtools-0.1.0-43
- * fixed a bug in razf_read()
- * give more warnings when the file is truncated (or due to bugs in I/O library)
-
-------------------------------------------------------------------------
-r15 | lh3lh3 | 2008-11-26 20:41:39 +0000 (Wed, 26 Nov 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/bgzf.c
-
-fixed a bug in bgzf.c at the end of the file
-
-------------------------------------------------------------------------
-r14 | lh3lh3 | 2008-11-26 17:05:18 +0000 (Wed, 26 Nov 2008) | 4 lines
-Changed paths:
- M /branches/dev/samtools/bamtk.c
-
- * samtools-0.1.0-42
- * a lot happened to RAZF, although samtools itself is untouched. Better
- also update the version number anyway to avoid confusion
-
-------------------------------------------------------------------------
-r13 | lh3lh3 | 2008-11-26 17:03:48 +0000 (Wed, 26 Nov 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/razf.c
-
-a change from Jue, but I think it should not matter
-
-------------------------------------------------------------------------
-r12 | lh3lh3 | 2008-11-26 16:48:14 +0000 (Wed, 26 Nov 2008) | 3 lines
-Changed paths:
- M /branches/dev/samtools/razf.c
-
-fixed a potential bug in razf. However, it seems still buggy, just
-rarely happens, very rarely.
-
-------------------------------------------------------------------------
-r11 | lh3lh3 | 2008-11-26 14:02:56 +0000 (Wed, 26 Nov 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/razf.c
-
-fixed a bug in razf, with the help of Jue
-
-------------------------------------------------------------------------
-r10 | lh3lh3 | 2008-11-26 11:55:32 +0000 (Wed, 26 Nov 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/bam_index.c
-
-remove a comment
-
-------------------------------------------------------------------------
-r9 | lh3lh3 | 2008-11-26 11:37:05 +0000 (Wed, 26 Nov 2008) | 2 lines
-Changed paths:
- M /branches/dev/samtools/Makefile
- M /branches/dev/samtools/bam.h
- M /branches/dev/samtools/razf.c
- M /branches/dev/samtools/razf.h
-
- * Jue has updated razf to realize Bob's scheme
-
-------------------------------------------------------------------------
-r7 | lh3lh3 | 2008-11-25 20:37:37 +0000 (Tue, 25 Nov 2008) | 2 lines
-Changed paths:
- A /branches/dev/samtools/samtools.1
-
-the manual page
-
-------------------------------------------------------------------------
-r6 | lh3lh3 | 2008-11-25 20:37:16 +0000 (Tue, 25 Nov 2008) | 3 lines
-Changed paths:
- A /branches/dev/samtools/ChangeLog
- A /branches/dev/samtools/Makefile
- A /branches/dev/samtools/bam.c
- A /branches/dev/samtools/bam.h
- A /branches/dev/samtools/bam_aux.c
- A /branches/dev/samtools/bam_endian.h
- A /branches/dev/samtools/bam_import.c
- A /branches/dev/samtools/bam_index.c
- A /branches/dev/samtools/bam_lpileup.c
- A /branches/dev/samtools/bam_pileup.c
- A /branches/dev/samtools/bam_sort.c
- A /branches/dev/samtools/bam_tview.c
- A /branches/dev/samtools/bamtk.c
- A /branches/dev/samtools/bgzf.c
- A /branches/dev/samtools/bgzf.h
- A /branches/dev/samtools/bgzip.c
- A /branches/dev/samtools/faidx.c
- A /branches/dev/samtools/faidx.h
- A /branches/dev/samtools/khash.h
- A /branches/dev/samtools/ksort.h
- A /branches/dev/samtools/kstream.h
- A /branches/dev/samtools/misc
- A /branches/dev/samtools/misc/Makefile
- A /branches/dev/samtools/misc/all2sam.pl
- A /branches/dev/samtools/misc/maq2sam.c
- A /branches/dev/samtools/razf.c
- A /branches/dev/samtools/razf.h
- A /branches/dev/samtools/razip.c
- A /branches/dev/samtools/zutil.h
-
-The initial version of samtools, replicated from my local SVN repository.
-The current version is: 0.1.0-42. All future development will happen here.
-
-------------------------------------------------------------------------
-r5 | lh3lh3 | 2008-11-25 20:30:49 +0000 (Tue, 25 Nov 2008) | 2 lines
-Changed paths:
- A /branches/dev/samtools
-
-samtools (C version)
-
-------------------------------------------------------------------------
-------------------------------------------------------------------------
-r703 | lh3 | 2008-11-25 20:20:02 +0000 (Tue, 25 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/samtools.1
-
-rename bamtk to samtools
-
-------------------------------------------------------------------------
-r702 | lh3 | 2008-11-25 20:15:09 +0000 (Tue, 25 Nov 2008) | 2 lines
-Changed paths:
- D /branches/prog/bam/bamtk.1
- A /branches/prog/bam/samtools.1 (from /branches/prog/bam/bamtk.1:679)
-
-rename bamtk.1 to samtools.1
-
-------------------------------------------------------------------------
-r701 | lh3 | 2008-11-25 13:29:10 +0000 (Tue, 25 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/Makefile
- M /branches/prog/bam/bam.c
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_import.c
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bamtk.c
- M /branches/prog/bam/misc/Makefile
-
- * samtools-0.1.0-41
- * small (but a bit dangerous) changes to meet the latest specification
-
-------------------------------------------------------------------------
-r700 | lh3 | 2008-11-25 13:15:11 +0000 (Tue, 25 Nov 2008) | 2 lines
-Changed paths:
- A /branches/prog/bam/misc/all2sam.pl (from /branches/prog/bam/misc/all2tam.pl:649)
- D /branches/prog/bam/misc/all2tam.pl
- A /branches/prog/bam/misc/maq2sam.c (from /branches/prog/bam/misc/maq2tam.c:699)
- D /branches/prog/bam/misc/maq2tam.c
-
-rename tam to sam
-
-------------------------------------------------------------------------
-r699 | lh3 | 2008-11-25 13:14:49 +0000 (Tue, 25 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/misc/maq2tam.c
-
-change for the new specification
-
-------------------------------------------------------------------------
-r698 | lh3 | 2008-11-24 13:15:20 +0000 (Mon, 24 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/razf.c
- M /branches/prog/bam/razf.h
-
- * add a fake BGZF mode to razf. It is fake in that it loads razf index into
- memory but gives BGZF like virtual offset
-
-------------------------------------------------------------------------
-r697 | lh3 | 2008-11-24 09:53:44 +0000 (Mon, 24 Nov 2008) | 2 lines
-Changed paths:
- A /branches/prog/bam/ChangeLog
-
-change log
-
-------------------------------------------------------------------------
-r696 | lh3 | 2008-11-24 09:53:23 +0000 (Mon, 24 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/bgzf.c
-
-updated bgzf, on behalf of Bob
-
-------------------------------------------------------------------------
-r695 | lh3 | 2008-11-23 11:40:31 +0000 (Sun, 23 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/razf.c
-
-fixed a bug in razf
-
-------------------------------------------------------------------------
-r694 | lh3 | 2008-11-22 16:23:52 +0000 (Sat, 22 Nov 2008) | 4 lines
-Changed paths:
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bam_lpileup.c
- M /branches/prog/bam/bam_tview.c
- M /branches/prog/bam/bamtk.c
-
- * bam-0.1.0-40
- * fixed two small memory leaks
- * fixed a memory problem when seek outside the length of the sequence
-
-------------------------------------------------------------------------
-r693 | lh3 | 2008-11-22 16:10:04 +0000 (Sat, 22 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bamtk.c
-
- * bam-0.1.0-39
- * fixed an uninitialized warning. This does not matter in fact
-
-------------------------------------------------------------------------
-r692 | lh3 | 2008-11-22 15:44:05 +0000 (Sat, 22 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/razf.c
- M /branches/prog/bam/razf.h
-
-Jue's new razf
-
-------------------------------------------------------------------------
-r691 | lh3 | 2008-11-21 21:30:39 +0000 (Fri, 21 Nov 2008) | 4 lines
-Changed paths:
- M /branches/prog/bam/Makefile
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bamtk.c
- M /branches/prog/bam/bgzip.c
-
- * bam-0.1.0-38
- * get rid of some warnings in bgzip.c
- * potentially improve performance in indexing for BGZF
-
-------------------------------------------------------------------------
-r690 | lh3 | 2008-11-21 21:15:51 +0000 (Fri, 21 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/bgzf.c
-
-I think I have fixed the bug in bgzf
-
-------------------------------------------------------------------------
-r689 | lh3 | 2008-11-21 20:48:56 +0000 (Fri, 21 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/bgzf.c
-
-bug fix by Bob
-
-------------------------------------------------------------------------
-r688 | lh3 | 2008-11-21 20:37:27 +0000 (Fri, 21 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_index.c
-
-fixed a bug due to the name change in _IOLIB
-
-------------------------------------------------------------------------
-r687 | lh3 | 2008-11-21 14:42:56 +0000 (Fri, 21 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/bgzf.c
-
-fix small things
-
-------------------------------------------------------------------------
-r686 | lh3 | 2008-11-21 14:37:59 +0000 (Fri, 21 Nov 2008) | 2 lines
-Changed paths:
- A /branches/prog/bam/bgzf.c
- A /branches/prog/bam/bgzf.h
- A /branches/prog/bam/bgzip.c
-
-Bob's BGZF format, although currently buggy
-
-------------------------------------------------------------------------
-r685 | lh3 | 2008-11-21 09:48:20 +0000 (Fri, 21 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bam_tview.c
- M /branches/prog/bam/bamtk.c
-
- * bam-0.1.0-37
- * improve interface a little bit
-
-------------------------------------------------------------------------
-r684 | lh3 | 2008-11-21 09:30:18 +0000 (Fri, 21 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam_tview.c
- M /branches/prog/bam/bamtk.c
-
- * bam-0.1.0-36
- * improve the interface of tview, a little bit
-
-------------------------------------------------------------------------
-r683 | lh3 | 2008-11-20 22:33:54 +0000 (Thu, 20 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/bam_tview.c
-
-a little better viewer
-
-------------------------------------------------------------------------
-r682 | lh3 | 2008-11-20 22:27:01 +0000 (Thu, 20 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_tview.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-35
- * better viewer
-
-------------------------------------------------------------------------
-r681 | lh3 | 2008-11-20 20:51:16 +0000 (Thu, 20 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/Makefile
- M /branches/prog/bam/bam_tview.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-34
- * tview is now a component of bamtk
-
-------------------------------------------------------------------------
-r680 | lh3 | 2008-11-20 19:17:30 +0000 (Thu, 20 Nov 2008) | 2 lines
-Changed paths:
- A /branches/prog/bam/bam_tview.c
-
-text alignment viewer
-
-------------------------------------------------------------------------
-r679 | lh3 | 2008-11-20 19:17:15 +0000 (Thu, 20 Nov 2008) | 5 lines
-Changed paths:
- M /branches/prog/bam/Makefile
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bam_lpileup.c
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bamtk.1
- M /branches/prog/bam/bamtk.c
- M /branches/prog/bam/faidx.c
-
- * bamtk-0.1.0-33
- * added routines to reset pileup buffers
- * fixed a bug in faidx
- * add text alignment viewer
-
-------------------------------------------------------------------------
-r678 | lh3 | 2008-11-20 11:05:02 +0000 (Thu, 20 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/Makefile
- A /branches/prog/bam/bam_lpileup.c (from /branches/prog/bam/bam_tview.c:668)
- D /branches/prog/bam/bam_tview.c
-
-rename tview as lpileup
-
-------------------------------------------------------------------------
-r677 | lh3 | 2008-11-20 10:08:52 +0000 (Thu, 20 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/razf.c
-
-fixed a bug in razf
-
-------------------------------------------------------------------------
-r676 | lh3 | 2008-11-19 22:52:20 +0000 (Wed, 19 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/faidx.h
-
-add documentations
-
-------------------------------------------------------------------------
-r674 | lh3 | 2008-11-19 21:39:17 +0000 (Wed, 19 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bamtk.1
- M /branches/prog/bam/faidx.h
-
-update documentation
-
-------------------------------------------------------------------------
-r673 | lh3 | 2008-11-19 21:19:03 +0000 (Wed, 19 Nov 2008) | 2 lines
-Changed paths:
- A /branches/prog/bam/bamtk.1
-
-add manual page
-
-------------------------------------------------------------------------
-r672 | lh3 | 2008-11-19 16:40:49 +0000 (Wed, 19 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bamtk.c
- M /branches/prog/bam/faidx.c
-
- * bamtk-0.1.0-32
- * make faidx more error resistant
-
-------------------------------------------------------------------------
-r671 | lh3 | 2008-11-19 16:09:55 +0000 (Wed, 19 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/faidx.h
-
-add index
-
-------------------------------------------------------------------------
-r670 | lh3 | 2008-11-19 16:02:39 +0000 (Wed, 19 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bamtk.c
- M /branches/prog/bam/faidx.c
-
- * bamtk-0.1.0-31
- * show reference sequence in pileup -v (not in the default pileup)
-
-------------------------------------------------------------------------
-r669 | lh3 | 2008-11-19 14:51:17 +0000 (Wed, 19 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/Makefile
- M /branches/prog/bam/bamtk.c
- M /branches/prog/bam/faidx.c
-
- * bamtk-0.1.0-30
- * put faidx in bamtk and remove faidx_main.c
-
-------------------------------------------------------------------------
-r668 | lh3 | 2008-11-19 14:15:05 +0000 (Wed, 19 Nov 2008) | 4 lines
-Changed paths:
- M /branches/prog/bam/Makefile
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bam_tview.c
- M /branches/prog/bam/bamtk.c
- A /branches/prog/bam/faidx.c
- A /branches/prog/bam/faidx.h
- M /branches/prog/bam/razf.c
-
- * bamtk-0.1.0-29
- * fixed a bug in tview.c
- * prepare to add faidx
-
-------------------------------------------------------------------------
-r667 | lh3 | 2008-11-19 10:20:45 +0000 (Wed, 19 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/razf.c
- M /branches/prog/bam/razf.h
-
-gzip-compatible razf
-
-------------------------------------------------------------------------
-r664 | lh3 | 2008-11-18 12:50:23 +0000 (Tue, 18 Nov 2008) | 5 lines
-Changed paths:
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-28
- * fetch: fixed a bug at an array boundary
- * fetch: fixed a bug when the whole chromosome is retrieved
- * add linear index
-
-------------------------------------------------------------------------
-r663 | lh3 | 2008-11-17 21:29:22 +0000 (Mon, 17 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/Makefile
- M /branches/prog/bam/bam.c
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_import.c
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bam_tview.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-27
- * put l_qseq into core and move l_aux to bam1_t
-
-------------------------------------------------------------------------
-r662 | lh3 | 2008-11-17 20:55:16 +0000 (Mon, 17 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam.c
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_import.c
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-26
- * save seq and qual separately
-
-------------------------------------------------------------------------
-r661 | lh3 | 2008-11-17 13:09:37 +0000 (Mon, 17 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/bam.h
-
-little
-
-------------------------------------------------------------------------
-r660 | lh3 | 2008-11-17 13:06:14 +0000 (Mon, 17 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/bam.h
-
-more documentations
-
-------------------------------------------------------------------------
-r659 | lh3 | 2008-11-17 12:55:08 +0000 (Mon, 17 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-25
- * make tview work for TAM
-
-------------------------------------------------------------------------
-r658 | lh3 | 2008-11-17 12:50:21 +0000 (Mon, 17 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/Makefile
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bam_tview.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-24
- * make tview as an independent module
-
-------------------------------------------------------------------------
-r657 | lh3 | 2008-11-17 11:26:06 +0000 (Mon, 17 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/Makefile
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_pileup.c
-
-change little
-
-------------------------------------------------------------------------
-r656 | lh3 | 2008-11-16 21:33:19 +0000 (Sun, 16 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-23
- * also add tview for TAM
-
-------------------------------------------------------------------------
-r655 | lh3 | 2008-11-16 21:29:46 +0000 (Sun, 16 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/Makefile
- M /branches/prog/bam/bam_tview.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-22
- * make tview more efficient for deep depth
-
-------------------------------------------------------------------------
-r654 | lh3 | 2008-11-16 20:52:19 +0000 (Sun, 16 Nov 2008) | 4 lines
-Changed paths:
- M /branches/prog/bam/Makefile
- M /branches/prog/bam/bam_import.c
- M /branches/prog/bam/bam_pileup.c
- A /branches/prog/bam/bam_tview.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-21
- * fixed bug in the TAM parser: lowercase not recognized
- * unfinished function to leveled pileup (tview)
-
-------------------------------------------------------------------------
-r653 | lh3 | 2008-11-15 12:58:36 +0000 (Sat, 15 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-20
- * pileup now display deleted bases as '*'
-
-------------------------------------------------------------------------
-r652 | lh3 | 2008-11-15 09:58:39 +0000 (Sat, 15 Nov 2008) | 4 lines
-Changed paths:
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-19
- * fixed a bug in fetch()
- * reduce memory in indexing
-
-------------------------------------------------------------------------
-r651 | lh3 | 2008-11-14 21:56:05 +0000 (Fri, 14 Nov 2008) | 5 lines
-Changed paths:
- M /branches/prog/bam/Makefile
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-18
- * important changes are made to index: the index size is increased, but
- now we have no limit on file sizes and the new method potentially
- works with BGZF, Bob's new compression format.
-
-------------------------------------------------------------------------
-r650 | lh3 | 2008-11-14 16:03:22 +0000 (Fri, 14 Nov 2008) | 4 lines
-Changed paths:
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-17
- * more comments in bam.h
- * fixed a bug in bam_index.c
-
-------------------------------------------------------------------------
-r649 | lh3 | 2008-11-13 16:04:18 +0000 (Thu, 13 Nov 2008) | 4 lines
-Changed paths:
- M /branches/prog/bam/bam.c
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_import.c
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bam_sort.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-16
- * use macros to retrieve pointers from bam1_t and thus reduce the size
- of bam1_t struct.
-
-------------------------------------------------------------------------
-r648 | lh3 | 2008-11-13 13:21:39 +0000 (Thu, 13 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam_sort.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-15
- * make more things work over pipe
-
-------------------------------------------------------------------------
-r647 | lh3 | 2008-11-13 12:49:28 +0000 (Thu, 13 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/misc/maq2tam.c
-
-fixed a bug in maq2tam
-
-------------------------------------------------------------------------
-r646 | lh3 | 2008-11-13 11:46:59 +0000 (Thu, 13 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/Makefile
- M /branches/prog/bam/misc/Makefile
- M /branches/prog/bam/misc/maq2tam.c
-
- * bug fix in maq2tam.c
- * improve Makefile
-
-------------------------------------------------------------------------
-r645 | lh3 | 2008-11-13 11:39:46 +0000 (Thu, 13 Nov 2008) | 3 lines
-Changed paths:
- A /branches/prog/bam/misc/Makefile
- M /branches/prog/bam/misc/maq2tam.c
-
- * corrected maq2tam
- * add Makefile
-
-------------------------------------------------------------------------
-r644 | lh3 | 2008-11-13 11:25:45 +0000 (Thu, 13 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/razf.c
-
-fixed the bug in buffered write (on behalf of Jue)
-
-------------------------------------------------------------------------
-r643 | lh3 | 2008-11-13 10:53:42 +0000 (Thu, 13 Nov 2008) | 2 lines
-Changed paths:
- D /branches/prog/bam/all2tam.pl
- A /branches/prog/bam/misc/all2tam.pl (from /branches/prog/bam/all2tam.pl:642)
-
-move to misc
-
-------------------------------------------------------------------------
-r642 | lh3 | 2008-11-13 10:53:23 +0000 (Thu, 13 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/all2tam.pl
-
-change tag
-
-------------------------------------------------------------------------
-r641 | lh3 | 2008-11-13 10:53:12 +0000 (Thu, 13 Nov 2008) | 2 lines
-Changed paths:
- D /branches/prog/bam/utils
-
-has been renamed
-
-------------------------------------------------------------------------
-r640 | lh3 | 2008-11-13 10:52:50 +0000 (Thu, 13 Nov 2008) | 2 lines
-Changed paths:
- A /branches/prog/bam/misc (from /branches/prog/bam/utils:639)
-
-rename
-
-------------------------------------------------------------------------
-r639 | lh3 | 2008-11-13 10:52:35 +0000 (Thu, 13 Nov 2008) | 2 lines
-Changed paths:
- A /branches/prog/bam/utils
- A /branches/prog/bam/utils/maq2tam.c
-
-utilities (converters and so on)
-
-------------------------------------------------------------------------
-r638 | lh3 | 2008-11-12 22:24:22 +0000 (Wed, 12 Nov 2008) | 4 lines
-Changed paths:
- M /branches/prog/bam/bam.c
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_import.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-14
- * copy the text header to BAM
- * add BAM1 header flag
-
-------------------------------------------------------------------------
-r637 | lh3 | 2008-11-12 14:56:08 +0000 (Wed, 12 Nov 2008) | 4 lines
-Changed paths:
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bamtk.c
- M /branches/prog/bam/razf.c
-
- * bamtk-0.1.0-13
- * fixed a bug in razf
- * improved and fixed potential bugs in index
-
-------------------------------------------------------------------------
-r636 | lh3 | 2008-11-12 11:57:13 +0000 (Wed, 12 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bamtk.c
-
-update documentation in the HeaderDOC format
-
-------------------------------------------------------------------------
-r635 | lh3 | 2008-11-12 10:08:38 +0000 (Wed, 12 Nov 2008) | 4 lines
-Changed paths:
- M /branches/prog/bam/bam.c
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_import.c
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-12
- * more documentations
- * rename baf1_core_t as bam1_core_t
-
-------------------------------------------------------------------------
-r634 | lh3 | 2008-11-11 23:00:35 +0000 (Tue, 11 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_pileup.c
-
-documentation
-
-------------------------------------------------------------------------
-r633 | lh3 | 2008-11-11 21:23:49 +0000 (Tue, 11 Nov 2008) | 4 lines
-Changed paths:
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-11
- * give up regional pileup. We can now use pipe to mimic that.
- * for index file, change suffix .idx to .bmi
-
-------------------------------------------------------------------------
-r632 | lh3 | 2008-11-11 21:00:11 +0000 (Tue, 11 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_import.c
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bamtk.c
- M /branches/prog/bam/razf.c
-
- * bamtk-0.1.0-10
- * make pileup work on TAM
-
-------------------------------------------------------------------------
-r631 | lh3 | 2008-11-11 09:20:29 +0000 (Tue, 11 Nov 2008) | 4 lines
-Changed paths:
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bamtk.c
- M /branches/prog/bam/razf.c
- M /branches/prog/bam/razf.h
- M /branches/prog/bam/razip.c
-
- * bamtk-0.1.0-9
- * razf now supports streaming
- * prepare to improve pileup (have not yet)
-
-------------------------------------------------------------------------
-r630 | lh3 | 2008-11-10 18:34:40 +0000 (Mon, 10 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_import.c
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-8
- * improve the interface of TAM parser
-
-------------------------------------------------------------------------
-r629 | lh3 | 2008-11-10 13:06:13 +0000 (Mon, 10 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-7
- * almost nothing
-
-------------------------------------------------------------------------
-r628 | lh3 | 2008-11-10 12:56:36 +0000 (Mon, 10 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam.c
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-6
- * fixed a bug in bam_pileup.c
-
-------------------------------------------------------------------------
-r627 | lh3 | 2008-11-10 11:32:46 +0000 (Mon, 10 Nov 2008) | 4 lines
-Changed paths:
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bamtk.c
- M /branches/prog/bam/razf.c
-
- * bamtk-0.1.0-5
- * fixed a bug in razf.c, caused by my modifications
- * improve the interface of pileup. Now it will be slower but more flexible
-
-------------------------------------------------------------------------
-r626 | lh3 | 2008-11-09 20:51:04 +0000 (Sun, 09 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-4
- * view: dumping binary output
-
-------------------------------------------------------------------------
-r625 | lh3 | 2008-11-09 20:31:54 +0000 (Sun, 09 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam.c
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_import.c
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bam_pileup.c
- M /branches/prog/bam/bam_sort.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-3
- * rename functions
-
-------------------------------------------------------------------------
-r624 | lh3 | 2008-11-09 15:07:32 +0000 (Sun, 09 Nov 2008) | 2 lines
-Changed paths:
- M /branches/prog/bam/bam.h
-
-add comments
-
-------------------------------------------------------------------------
-r623 | lh3 | 2008-11-08 22:32:49 +0000 (Sat, 08 Nov 2008) | 4 lines
-Changed paths:
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-2
- * improve indexing for a mixture of long and short reads, although currently
- I do not know whether it really works...
-
-------------------------------------------------------------------------
-r622 | lh3 | 2008-11-08 22:13:58 +0000 (Sat, 08 Nov 2008) | 3 lines
-Changed paths:
- M /branches/prog/bam/bam_index.c
- M /branches/prog/bam/bamtk.c
-
- * bamtk-0.1.0-1
- * prepare for improving indexing algorithm
-
-------------------------------------------------------------------------
-r621 | lh3 | 2008-11-08 20:28:09 +0000 (Sat, 08 Nov 2008) | 4 lines
-Changed paths:
- A /branches/prog/bam/all2tam.pl
- M /branches/prog/bam/bam.c
- M /branches/prog/bam/bam.h
- M /branches/prog/bam/bam_import.c
- M /branches/prog/bam/bamtk.c
- D /branches/prog/bam/tam_utils.pl
-
- * bamtk-0.1.0
- * smarter integers
- * rename tam_utils.pl to all2tam.pl
-
-------------------------------------------------------------------------
-r620 | lh3 | 2008-11-08 17:17:22 +0000 (Sat, 08 Nov 2008) | 2 lines
-Changed paths:
- A /branches/prog/bam
- A /branches/prog/bam/Makefile
- A /branches/prog/bam/bam.c
- A /branches/prog/bam/bam.h
- A /branches/prog/bam/bam_endian.h
- A /branches/prog/bam/bam_import.c
- A /branches/prog/bam/bam_index.c
- A /branches/prog/bam/bam_pileup.c
- A /branches/prog/bam/bam_sort.c
- A /branches/prog/bam/bamtk.c
- A /branches/prog/bam/khash.h
- A /branches/prog/bam/ksort.h
- A /branches/prog/bam/kstream.h
- A /branches/prog/bam/razf.c
- A /branches/prog/bam/razf.h
- A /branches/prog/bam/razip.c
- A /branches/prog/bam/tam_utils.pl
- A /branches/prog/bam/zutil.h
-
-The Binary Alignment/Mapping format.
-
-------------------------------------------------------------------------
diff --git a/sam/INSTALL b/sam/INSTALL
deleted file mode 100644
index 37d84a9..0000000
--- a/sam/INSTALL
+++ /dev/null
@@ -1,30 +0,0 @@
-System Requirements
-===================
-
-SAMtools depends on the zlib library <http://www.zlib.net>. Version 1.2.3+ is
-preferred and with 1.2.3+ you can compile razip and use it to compress a FASTA
-file. SAMtools' faidx is able to index a razip-compressed FASTA file to save
-diskspace. Older zlib also works with SAMtools, but razip cannot be compiled.
-
-The text-based viewer (tview) requires the GNU ncurses library
-<http://www.gnu.org/software/ncurses/>, which comes with Mac OS X and most of
-the modern Linux/Unix distributions. If you do not have this library installed,
-you can still compile the rest of SAMtools by manually changing:
-`-D_CURSES_LIB=1' to `-D_CURSES_LIB=0' at the line starting with `DFLAGS=', and
-comment out the line starting with `LIBCURSES='.
-
-
-Compilation
-===========
-
-Type `make' to compile samtools. If you have zlib >= 1.2.2.1, you can compile
-razip with `make razip'.
-
-
-Installation
-============
-
-Copy `samtools', `bcftools/bcftools' and other executables/scripts in `misc' to
-a location you want (e.g. a directory in your $PATH). You may also copy
-`samtools.1' and `bcftools/bcftools.1' to a directory in your $MANPATH such
-that the `man' command may find the manual.
diff --git a/sam/Makefile.mingw b/sam/Makefile.mingw
deleted file mode 100644
index 7a57ffc..0000000
--- a/sam/Makefile.mingw
+++ /dev/null
@@ -1,63 +0,0 @@
-CC= gcc.exe
-AR= ar.exe
-CFLAGS= -g -Wall -O2
-DFLAGS= -D_USE_KNETFILE -D_CURSES_LIB=2
-KNETFILE_O= knetfile.o
-LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
- bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o \
- $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bedidx.o
-AOBJS= bam_tview.o bam_plcmd.o sam_view.o \
- bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
- bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \
- cut_target.o phase.o bam_cat.o bam2depth.o
-BCFOBJS= bcftools/bcf.o bcftools/fet.o bcftools/bcf2qcall.o bcftools/bcfutils.o \
- bcftools/call1.o bcftools/index.o bcftools/kfunc.o bcftools/em.o \
- bcftools/kmin.o bcftools/prob1.o bcftools/vcf.o bcftools/mut.o
-PROG= samtools.exe bcftools.exe
-INCLUDES= -I. -Iwin32
-SUBDIRS= .
-LIBPATH=
-
-.SUFFIXES:.c .o
-
-.c.o:
- $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
-
-all:$(PROG)
-
-.PHONY:all lib clean cleanlocal
-.PHONY:all-recur lib-recur clean-recur cleanlocal-recur install-recur
-
-lib:libbam.a
-
-libbam.a:$(LOBJS)
- $(AR) -cru $@ $(LOBJS)
-
-samtools.exe:$(AOBJS) libbam.a $(BCFOBJS)
- $(CC) $(CFLAGS) -o $@ $(AOBJS) $(BCFOBJS) $(LIBPATH) -lm -L. -lbam -Lwin32 -lz -lcurses -lws2_32
-
-bcftools.exe:$(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o
- $(CC) $(CFLAGS) -o $@ $(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o -lm -Lwin32 -lz -lws2_32
-
-razip.o:razf.h
-bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h
-sam.o:sam.h bam.h
-bam_import.o:bam.h kseq.h khash.h razf.h
-bam_pileup.o:bam.h razf.h ksort.h
-bam_plcmd.o:bam.h faidx.h bcftools/bcf.h bam2bcf.h
-bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h
-bam_lpileup.o:bam.h ksort.h
-bam_tview.o:bam.h faidx.h
-bam_sort.o:bam.h ksort.h razf.h
-bam_md.o:bam.h faidx.h
-sam_header.o:sam_header.h khash.h
-bcf.o:bcftools/bcf.h
-bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h
-bam2bcf_indel.o:bam2bcf.h
-errmod.o:errmod.h
-
-faidx.o:faidx.h razf.h khash.h
-faidx_main.o:faidx.h razf.h
-
-clean:
- rm -fr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib
diff --git a/sam/NEWS b/sam/NEWS
deleted file mode 100644
index 121485e..0000000
--- a/sam/NEWS
+++ /dev/null
@@ -1,836 +0,0 @@
-Beta Release 0.1.19 (15 March, 2013)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes in samtools and bcftools:
-
- * The latest source code and development moved to github,
- http://github.com/samtools/samtools
-
- * Many important bugfixes and contributions by many people. Thanks to all!
-
- * Performance improvements (multi-threading)
-
- * Important changes in calling, see
- - samtools mpileup -p
- - bcftools view -m
-
- * New annotations useful for filtering (RPB, HWE, QBD, MDV)
-
- * New tools, bamcheck and plot-bamcheck
-
- * New features in samtools tview
-
- * And much more..
-
-For a detailed list of commits, please see
-http://github.com/samtools/samtools/commits/master
-
-(0.1.19: 15 March 2013, commit 96b5f2294ac0054230e88913c4983d548069ea4e)
-
-
-Beta Release 0.1.18 (2 September, 2011)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes in samtools:
-
- * Support the new =/X CIGAR operators (by Peter Cock).
-
- * Allow to subsample BAM while keeping the pairing intact (view -s).
-
- * Implemented variant distance bias as a new filter (by Petr Danecek).
-
- * Bugfix: huge memory usage during indexing
-
- * Bugfix: use of uninitialized variable in mpileup (rare)
-
- * Bugfix: wrong BAQ probability (rare)
-
-Notable changes in bcftools:
-
- * Support indel in the contrast caller.
-
- * Bugfix: LRT2=nan in rare cases
-
-(0.1.18: 2 September 2011, r982:295)
-
-
-
-Beta Release 0.1.17 (6 July, 2011)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-With the maturity of `mpileup' and the lack of update in the `pileup' command,
-the `pileup' command is now formally dropped. Most of the pileup functionality,
-such as outputting mapping quality and read positions, have been added to
-`mpileup'.
-
-Since this release, `bcftools view' is able to perform contrast SNP calling
-(option -T) for discovering de novo and/or somatic mutations between a pair of
-samples or in a family trio. Potential mutations are scored by a log likelihood
-ratio, which is very simple in math, but should be comparable to more
-sophisticated methods. Note that getting the score is only the very first step.
-A lot more need to be done to reduce systematical errors due to mapping and
-reference errors and structural variations.
-
-Other notable changes in samtools:
-
- * Improved sorting order checking during indexing.
-
- * Improved region parsing. Colons in reference sequence names are parsed
- properly.
-
- * Fixed an issue where mpileup does not apply BAQ for the first few reads when
- a region is specified.
-
- * Fixed an issue where `faidx' does not work with FASTA files with long lines.
-
- * Bugfix: wrong SP genotype information in the BCF output.
-
-Other notable changes in bcftools:
-
- * Output the ML estimate of the allele count.
-
- * Added the HWE plus F<0 filter to varFilter. For multiple samples, it
- effectively filters false heterozygous calls around centromeres.
-
- * For association mapping, perform both 1-degree and 2-degree test. The
- 2-degree test is conservative but more robust to HWE violation.
-
-(0.1.17: 6 July 2011, r973:277)
-
-
-
-Beta Release 0.1.16 (21 April, 2011)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes in samtools:
-
- * Support the new SAM/BAM type `B' in the latest SAM spec v1.4.
-
- * When the output file of `samtools merge' exists, do not overwrite it unless
- a new command-line option `-f' is applied.
-
- * Bugfix: BED support is not working when the input BED is not sorted.
-
- * Bugfix: some reads without coordinates but given on the reverse strand are
- lost in merging.
-
-Notable changes in bcftools:
-
- * Code cleanup: separated max-likelihood inference and Bayesian inference.
-
- * Test Hardy-Weinberg equilibrium with a likelihood-ratio test.
-
- * Provided another association test P-value by likelihood-ratio test.
-
- * Use Brent's method to estimate the site allele frequency when EM converges
- slowly. The resulting ML estimate of allele frequency is more accurate.
-
- * Added the `ldpair' command, which computes r^2 between SNP pairs given in
- an input file.
-
-Also, the `pileup' command, which has been deprecated by `mpileup' since
-version 0.1.10, will be dropped in the next release. The old `pileup' command
-is substandard and causing a lot of confusion.
-
-(0.1.16: 21 April 2011, r963:234)
-
-
-
-Beta Release 0.1.15 (10 April, 2011)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes:
-
- * Allow to perform variant calling or to extract information in multiple
- regions specified by a BED file (`samtools mpileup -l', `samtools view -L'
- and `bcftools view -l').
-
- * Added the `depth' command to samtools to compute the per-base depth with a
- simpler interface. File `bam2depth.c', which implements this command, is the
- recommended example on how to use the mpileup APIs.
-
- * Estimate genotype frequencies with ML; perform chi^2 based Hardy-Weinberg
- test using this estimate.
-
- * For `samtools view', when `-R' is specified, drop read groups in the header
- that are not contained in the specified file.
-
- * For `samtools flagstat', separate QC-pass and QC-fail reads.
-
- * Improved the command line help of `samtools mpileup' and `bcftools view'.
-
- * Use a global variable to control the verbose level of samtools stderr
- output. Nonetheless, it has not been fully utilized.
-
- * Fixed an issue in association test which may report false associations,
- possibly due to floating point underflow.
-
-(0.1.15: 10 April 2011, r949:203)
-
-
-
-Beta release 0.1.14 (21 March, 2011)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This release implements a method for testing associations for case-control
-data. The method does not call genotypes but instead sums over all genotype
-configurations to compute a chi^2 based test statistics. It can be potentially
-applied to comparing a pair of samples (e.g. a tumor-normal pair), but this
-has not been evaluated on real data.
-
-Another new feature is to make X chromosome variant calls when female and male
-samples are both present. The user needs to provide a file indicating the
-ploidy of each sample (see also manual bcftools/bcftools.1).
-
-Other notable changes:
-
- * Added `bcftools view -F' to parse BCF files generated by samtools r921 or
- older which encodes PL in a different way.
-
- * Changed the behavior of `bcftools view -s'. Now when a list of samples is
- provided, the samples in the output will be reordered to match the ordering
- in the sample list. This change is mainly designed for association test.
-
- * Sped up `bcftools view -v' for target sequencing given thousands of samples.
- Also added a new option `view -d' to skip loci where only a few samples are
- covered by reads.
-
- * Dropped HWE test. This feature has never been implemented properly. An EM
- should be much better. To be implemented in future.
-
- * Added the `cat' command to samtools. This command concatenates BAMs with
- identical sequence dictionaries in an efficient way. Modified from bam_cat.c
- written by Chris Saunders.
-
- * Added `samtools view -1' to write BAMs at a low compression level but twice
- faster to create. The `sort' command generates temporary files at a low
- compression level as well.
-
- * Added `samtools mpileup -6' to accept "BAM" with Illumina 1.3+ quality
- strings (strictly speaking, such a file is not BAM).
-
- * Added `samtools mpileup -L' to skip INDEL calling in regions with
- excessively high coverage. Such regions dramatically slow down mpileup.
-
- * Updated `misc/export2sam.pl', provided by Chris Saunders from Illumina Inc.
-
-(0.1.14: 21 March 2011, r933:170)
-
-
-
-Beta release 0.1.13 (1 March, 2011)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The most important though largely invisible modification is the change of the
-order of genotypes in the PL VCF/BCF tag. This is to conform the upcoming VCF
-spec v4.1. The change means that 0.1.13 is not backward compatible with VCF/BCF
-generated by samtools older than r921 inclusive. VCF/BCF generated by the new
-samtools will contain a line `##fileformat=VCFv4.1' as well as the samtools
-version number.
-
-Single Individual Haplotyping (SIH) is added as an experimental feature. It
-originally aims to produce haploid consensus from fosmid pool sequencing, but
-also works with short-read data. For short reads, phased blocks are usually too
-short to be useful in many applications, but they can help to rule out part of
-SNPs close to INDELs or between copies of CNVs.
-
-
-Other notable changes in samtools:
-
- * Construct per-sample consensus to reduce the effect of nearby SNPs in INDEL
- calling. This reduces the power but improves specificity.
-
- * Improved sorting order checking in indexing. Now indexing is the preferred way
- to check if a BAM is sorted.
-
- * Added a switch `-E' to mpileup and calmd. This option uses an alternative way
- to apply BAQ, which increases sensitivity, especially to MNPs, at the cost of
- a little loss in specificity.
-
- * Added `mpileup -A' to allow to use reads in anomalous pairs in SNP calling.
-
- * Added `mpileup -m' to allow fine control of the collection of INDEL candidates.
-
- * Added `mpileup -S' to compute per-sample strand bias P-value.
-
- * Added `mpileup -G' to exclude read groups in variant calling.
-
- * Fixed segfault in indel calling related to unmapped and refskip reads.
-
- * Fixed an integer overflow in INDEL calling. This bug produces wrong INDEL
- genotypes for longer short INDELs, typically over 10bp.
-
- * Fixed a bug in tview on big-endian machines.
-
- * Fixed a very rare memory issue in bam_md.c
-
- * Fixed an out-of-boundary bug in mpileup when the read base is `N'.
-
- * Fixed a compiling error when the knetfile library is not used. Fixed a
- library compiling error due to the lack of bam_nt16_nt4_table[] table.
- Suppress a compiling warning related to the latest zlib.
-
-
-Other notable changes in bcftools:
-
- * Updated the BCF spec.
-
- * Added the `FQ' VCF INFO field, which gives the phred-scaled probability
- of all samples being the same (identical to the reference or all homozygous
- variants). Option `view -f' has been dropped.
-
- * Implemented "vcfutils.pl vcf2fq" to generate a consensus sequence
- similar to "samtools.pl pileup2fq".
-
- * Make sure the GT FORMAT field is always the first FORMAT to conform the VCF
- spec. Drop bcf-fix.pl.
-
- * Output bcftools specific INFO and FORMAT in the VCF header.
-
- * Added `view -s' to call variants from a subset of samples.
-
- * Properly convert VCF to BCF with a user provided sequence dictionary. Nonetheless,
- custom fields are still unparsed and will be stored as a missing value.
-
- * Fixed a minor bug in Fisher's exact test; the results are rarely changed.
-
-
-(0.1.13: 1 March 2011, r926:134)
-
-
-
-Beta release 0.1.12a (2 December, 2010)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This is another bug fix release:
-
- * Fixed a memory violation in mpileup, which causes segfault. Release
- 0.1.9 and above are affected.
-
- * Fixed a memory violation in the indel caller, which does not cause
- segfault, but may potentially affect deletion calls in an unexpected
- way. Release 0.1.10 and above are affected.
-
- * Fixed a bug in computing r-square in bcftools. Few are using this
- functionality and it only has minor effect.
-
- * Fixed a memory leak in bam_fetch().
-
- * Fixed a bug in writing meta information to the BAM index for the last
- sequence. This bug is invisible to most users, but it is a bug anyway.
-
- * Fixed a bug in bcftools which causes false "DP4=0,0,0,0" annotations.
-
-(0.1.12: 2 December 2010, r862)
-
-
-
-Beta release 0.1.11 (21 November, 2010)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This is mainly a bug fix release:
-
- * Fixed a bug in random retrieval (since 0.1.8). It occurs when reads
- are retrieved from a small region containing no reads.
-
- * Fixed a bug in pileup (since 0.1.9). The bug causes an assertion
- failure when the first CIGAR operation is a deletion.
-
- * Improved fault tolerance in remote access.
-
-One minor feature has been implemented in bcftools:
-
- * Added a reference-free variant calling mode. In this mode, a site is
- regarded as a variant iff the sample(s) contains two or more alleles;
- the meaning of the QUAL field in the VCF output is changed
- accordingly. Effectively, the reference allele is irrelevant to the
- result in the new mode, although the reference sequence has to be
- used in realignment when SAMtools computes genotype likelihoods.
-
-In addition, since 0.1.10, the `pileup' command has been deprecated by
-`mpileup' which is more powerful and more accurate. The `pileup' command
-will not be removed in the next few releases, but new features will not
-be added.
-
-(0.1.11: 21 November 2010, r851)
-
-
-
-Beta Release 0.1.10 (16 November, 2010)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This release is featured as the first major improvement to the indel
-caller. The method is similar to the old one implemented in the pileup
-command, but the details are handled more carefully both in theory and
-in practice. As a result, the new indel caller usually gives more
-accurate indel calls, though at the cost of sensitivity. The caller is
-implemented in the mpileup command and is invoked by default. It works
-with multiple samples.
-
-Other notable changes:
-
- * With the -r option, the calmd command writes the difference between
- the original base quality and the BAQ capped base quality at the BQ
- tag but does not modify the base quality. Please use -Ar to overwrite
- the original base quality (the 0.1.9 behavior).
-
- * Allow to set a maximum per-sample read depth to reduce memory. In
- 0.1.9, most of memory is wasted for the ultra high read depth in some
- regions (e.g. the chr1 centromere).
-
- * Optionally write per-sample read depth and per-sample strand bias
- P-value.
-
- * Compute equal-tail (Bayesian) credible interval of site allele
- frequency at the CI95 VCF annotation.
-
- * Merged the vcfutils.pl varFilter and filter4vcf for better SNP/indel
- filtering.
-
-(0.1.10: 16 November 2010, r829)
-
-
-
-Beta Release 0.1.9 (27 October, 2010)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This release is featured as the first major improvement to the samtools'
-SNP caller. It comes with a revised MAQ error model, the support of
-multi-sample SNP calling and the computation of base alignment quality
-(BAQ).
-
-The revised MAQ error model is based on the original model. It solves an
-issue of miscalling SNPs in repetitive regions. Although such SNPs can
-usually be filtered at a later step, they mess up unfiltered calls. This
-is a theoretical flaw in the original model. The revised MAQ model
-deprecates the original MAQ model and the simplified SOAPsnp model.
-
-Multi-sample SNP calling is separated in two steps. The first is done by
-samtools mpileup and the second by a new program, bcftools, which is
-included in the samtools source code tree. Multi-sample SNP calling also
-works for single sample and has the advantage of enabling more powerful
-filtration. It is likely to deprecate pileup in future once a proper
-indel calling method is implemented.
-
-BAQ is the Phred-scaled probability of a read base being wrongly
-aligned. Capping base quality by BAQ has been shown to be very effective
-in suppressing false SNPs caused by misalignments around indels or in
-low-complexity regions with acceptable compromise on computation
-time. This strategy is highly recommended and can be used with other SNP
-callers as well.
-
-In addition to the three major improvements, other notable changes are:
-
- * Changes to the pileup format. A reference skip (the N CIGAR operator)
- is shown as '<' or '>' depending on the strand. Tview is also changed
- accordingly.
-
- * Accelerated pileup. The plain pileup is about 50% faster.
-
- * Regional merge. The merge command now accepts a new option to merge
- files in a specified region.
-
- * Fixed a bug in bgzip and razip which causes source files to be
- deleted even if option -c is applied.
-
- * In APIs, propagate errors to downstream callers and make samtools
- return non-zero values once errors occur.
-
-(0.1.9: 27 October 2010, r783)
-
-
-
-Beta Release 0.1.8 (11 July, 2010)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable functional changes:
-
- * Added the `reheader' command which replaces a BAM header with a new
- header. This command is much faster than replacing header by
- BAM->SAM->BAM conversions.
-
- * Added the `mpileup' command which computes the pileup of multiple
- alignments.
-
- * The `index' command now stores the number of mapped and unmapped
- reads in the index file. This information can be retrieved quickly by
- the new `idxstats' command.
-
- * By default, pileup used the SOAPsnp model for SNP calling. This
- avoids the floating overflow in the MAQ model which leads to spurious
- calls in repetitive regions, although these calls will be immediately
- filtered by varFilter.
-
- * The `tview' command now correctly handles CIGARs like 7I10M and
- 10M1P1I10M which cause assertion failure in earlier versions.
-
- * Tview accepts a region like `=10,000' where `=' stands for the
- current sequence name. This saves typing for long sequence names.
-
- * Added the `-d' option to `pileup' which avoids slow indel calling
- in ultradeep regions by subsampling reads locally.
-
- * Added the `-R' option to `view' which retrieves alignments in read
- groups listed in the specified file.
-
-Performance improvements:
-
- * The BAM->SAM conversion is up to twice faster, depending on the
- characteristic of the input.
-
- * Parsing SAM headers with a lot of reference sequences is now much
- faster.
-
- * The number of lseek() calls per query is reduced when the query
- region contains no read alignments.
-
-Bug fixes:
-
- * Fixed an issue in the indel caller that leads to miscall of indels.
- Note that this solution may not work well when the sequencing indel
- error rate is higher than the rate of SNPs.
-
- * Fixed another issue in the indel caller which may lead to incorrect
- genotype.
-
- * Fixed a bug in `sort' when option `-o' is applied.
-
- * Fixed a bug in `view -r'.
-
-APIs and other changes:
-
- * Added iterator interfaces to random access and pileup. The callback
- interfaces directly call the iterator interfaces.
-
- * The BGZF blocks holding the BAM header are independent of alignment
- BGZF blocks. Alignment records shorter than 64kB are guaranteed to be
- fully contained in one BGZF block. This change is fully compatible
- with the old version of samtools/picard.
-
-Changes in other utilities:
-
- * Updated export2sam.pl by Chris Saunders.
-
- * Improved the sam2vcf.pl script.
-
- * Added a Python version of varfilter.py by Aylwyn Scally.
-
-(0.1.8: 11 July 2010, r613)
-
-
-
-Beta Release 0.1.7 (10 November, 2009)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes:
-
- * Improved the indel caller in complex scenarios, in particular for
- long reads. The indel caller is now able to make reasonable indel
- calls from Craig Venter capillary reads.
-
- * Rewrote single-end duplicate removal with improved
- performance. Paired-end reads are not touched.
-
- * Duplicate removal is now library aware. Samtools removes potential
- PCR/optical duplicates inside a library rather than across libraries.
-
- * SAM header is now fully parsed, although this functionality is not
- used in merging and so on.
-
- * In samtools merge, optionally take the input file name as RG-ID and
- attach the RG tag to each alignment.
-
- * Added FTP support in the RAZF library. RAZF-compressed reference
- sequence can be retrieved remotely.
-
- * Improved network support for Win32.
-
- * Samtools sort and merge are now stable.
-
-Changes in other utilities:
-
- * Implemented sam2vcf.pl that converts the pileup format to the VCF
- format.
-
- * This release of samtools is known to work with the latest
- Bio-Samtools Perl module.
-
-(0.1.7: 10 November 2009, r510)
-
-
-
-Beta Release 0.1.6 (2 September, 2009)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes:
-
- * In tview, do not show a blank screen when no reads mapped to the
- corresponding region.
-
- * Implemented native HTTP support in the BGZF library. Samtools is now
- able to directly open a BAM file on HTTP. HTTP proxy is also
- supported via the "http_proxy" environmental variable.
-
- * Samtools is now compatible with the MinGW (win32) compiler and the
- PDCurses library.
-
- * The calmd (or fillmd) command now calculates the NM tag and replaces
- MD tags if they are wrong.
-
- * The view command now recognizes and optionally prints FLAG in HEXs or
- strings to make a SAM file more friendly to human eyes. This is a
- samtools-C extension, not implemented in Picard for the time
- being. Please type `samtools view -?' for more information.
-
- * BAM files now have an end-of-file (EOF) marker to facilitate
- truncation detection. A warning will be given if an on-disk BAM file
- does not have this marker. The warning will be seen on BAM files
- generated by an older version of samtools. It does NO harm.
-
- * New key bindings in tview: `r' to show read names and `s' to show
- reference skip (N operation) as deletions.
-
- * Fixed a bug in `samtools merge -n'.
-
- * Samtools merge now optionally copies the header of a user specified
- SAM file to the resultant BAM output.
-
- * Samtools pileup/tview now works with a CIGAR whose first or last
- operation is an indel.
-
- * Fixed a bug in bam_aux_get().
-
-
-Changes in other utilities:
-
- * Fixed wrong FLAG in maq2sam.
-
-
-(0.1.6: 2 September 2009, r453)
-
-
-
-Beta Release 0.1.5 (7 July, 2009)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes:
-
- * Support opening a BAM alignment on FTP. Users can now use "tview" to
- view alignments at the NCBI ftp site. Please read manual for more
- information.
-
- * In library, propagate errors rather than exit or complain assertion
- failure.
-
- * Simplified the building system and fixed compiling errors caused by
- zlib<1.2.2.1.
-
- * Fixed an issue about lost header information when a SAM is imported
- with "view -t".
-
- * Implemented "samtools.pl varFilter" which filters both SNPs and short
- indels. This command replaces "indelFilter".
-
- * Implemented "samtools.pl pileup2fq" to generate FASTQ consensus from
- pileup output.
-
- * In pileup, cap mapping quality at 60. This helps filtering when
- different aligners are in use.
-
- * In pileup, allow to output variant sites only.
-
- * Made pileup generate correct calls in repetitive region. At the same
- time, I am considering to implement a simplified model in SOAPsnp,
- although this has not happened yet.
-
- * In view, added '-u' option to output BAM without compression. This
- option is preferred when the output is piped to other commands.
-
- * In view, added '-l' and '-r' to get the alignments for one library or
- read group. The "@RG" header lines are now partially parsed.
-
- * Do not include command line utilities to libbam.a.
-
- * Fixed memory leaks in pileup and bam_view1().
-
- * Made faidx more tolerant to empty lines right before or after FASTA >
- lines.
-
-
-Changes in other utilities:
-
- * Updated novo2sam.pl by Colin Hercus, the key developer of novoalign.
-
-
-This release involves several modifications to the key code base which
-may potentially introduce new bugs even though we have tried to minimize
-this by testing on several examples. Please let us know if you catch
-bugs.
-
-(0.1.5: 7 July 2009, r373)
-
-
-
-Beta Release 0.1.4 (21 May, 2009)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes:
-
- * Added the 'rmdupse' command: removing duplicates for SE reads.
-
- * Fixed a critical bug in the indel caller: clipped alignments are not
- processed correctly.
-
- * Fixed a bug in the tview: gapped alignment may be incorrectly
- displayed.
-
- * Unified the interface to BAM and SAM I/O. This is done by
- implementing a wrapper on top of the old APIs and therefore old APIs
- are still valid. The new I/O APIs also recognize the @SQ header
- lines.
-
- * Generate the MD tag.
-
- * Generate "=" bases. However, the indel caller will not work when "="
- bases are present.
-
- * Enhanced support of color-read display (by Nils Homer).
-
- * Implemented the GNU building system. However, currently the building
- system does not generate libbam.a. We will improve this later. For
- the time being, `make -f Makefile.generic' is preferred.
-
- * Fixed a minor bug in pileup: the first read in a chromosome may be
- skipped.
-
- * Fixed bugs in bam_aux.c. These bugs do not affect other components as
- they were not used previously.
-
- * Output the 'SM' tag from maq2sam.
-
-(0.1.4: 21 May 2009, r297)
-
-
-
-Beta Release 0.1.3 (15 April, 2009)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes in SAMtools:
-
- * SAMtools is more consistent with the specification: a) '*' in the
- QUAL field is allowed; b) the field separator is TAB only and SPACE
- is treated as a character in a field; c) empty header is allowed.
-
- * Implemented GLFv3 support in pileup.
-
- * Fixed a severe bug in fixmate: strand information is wrongly
- overwritten.
-
- * Fixed a bug in alignment retrieval: alignments bridging n*16384bp are
- not correctly retrieved sometimes.
-
- * Fixed a bug in rmdup: segfault if unmapped reads are present.
-
- * Move indel_filter.pl to samtools.pl and improved the filtering by
- checking the actual number of alignments containing indels. The indel
- pileup line is also changed a little to make this filtration easier.
-
- * Fixed a minor bug in indexing: the bin number of an unmapped read is
- wrongly calculated.
-
- * Added `flagstat' command to show statistics on the FLAG field.
-
- * Improved indel caller by setting the maximum window size in local
- realignment.
-
-Changes in other utilities:
-
- * Fixed a bug in maq2sam: a tag name is obsolete.
-
- * Improvement to wgsim: a) added support for SOLiD read simulation; b)
- show the number of substitutions/indels/errors in read name; c)
- considerable code clean up.
-
- * Various converters: improved functionality in general.
-
- * Updated the example SAM due to the previous bug in fixmate.
-
-(0.1.3: 15 April 2009, r227)
-
-
-
-Beta Release 0.1.2 (28 January, 2009)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Notable changes in SAMtools:
-
- * Implemented a Bayesian indel caller. The new caller generates scores
- and genotypes and is potentially more accurate than Maq's indel
- caller. The pileup format is also changed accordingly.
-
- * Implemented rmdup command: remove potential PCR duplicates. Note that
- this command ONLY works for FR orientation and requires ISIZE is
- correctly set.
-
- * Added fixmate command: fill in mate coordinates, ISIZE and mate
- related flags from a name-sorted alignment.
-
- * Fixed a bug in indexing: reads bridging 16x kbp were not retrieved.
-
- * Allow to select reads shown in the pileup output with a mask.
-
- * Generate GLFv2 from pileup.
-
- * Added two more flags for flagging PCR/optical duplicates and for QC
- failure.
-
- * Fixed a bug in sort command: name sorting for large alignment did not
- work.
-
- * Allow to completely disable RAZF (using Makefile.lite) as some people
- have problem to compile it.
-
- * Fixed a bug in import command when there are reads without
- coordinates.
-
- * Fixed a bug in tview: clipping broke the alignment viewer.
-
- * Fixed a compiling error when _NO_CURSES is applied.
-
- * Fixed a bug in merge command.
-
-Changes in other utilities:
-
- * Added wgsim, a paired-end reads simulator. Wgsim was adapted from
- maq's reads simulator. Colin Hercus further improved it to allow
- longer indels.
-
- * Added wgsim_eval.pl, a script that evaluates the accuracy of
- alignment on reads generated by wgsim.
-
- * Added soap2sam.pl, a SOAP2->SAM converter. This converter does not
- work properly when multiple hits are output.
-
- * Added bowtie2sam.pl, a Bowtie->SAM converter. Only the top hit will
- be retained when multiple hits are present.
-
- * Fixed a bug in export2sam.pl for QC reads.
-
- * Support RG tag at MAQ->SAM converter.
-
- * Added novo2sam.pl, a NovoAlign->SAM converter. Multiple hits and
- indel are not properly handled, though.
-
- * Added zoom2sam.pl, a ZOOM->SAM converter. It only works with the
- default Illumina output.
-
-(0.1.2: 28 January 2009; r116)
-
-
-
-Beta Release 0.1.1 (22 December, 2008)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This is the first public release of samtools. For more information,
-please check the manual page `samtools.1' and the samtools website
-http://samtools.sourceforge.net
diff --git a/sam/bam.c b/sam/bam.c
deleted file mode 100644
index b00d6a6..0000000
--- a/sam/bam.c
+++ /dev/null
@@ -1,474 +0,0 @@
-#include <stdio.h>
-#include <ctype.h>
-#include <errno.h>
-#include <assert.h>
-#include "bam.h"
-#include "bam_endian.h"
-#include "kstring.h"
-#include "sam_header.h"
-
-// bam_is_be is set by bam_header_init() from bam_is_big_endian(); when it is
-// non-zero the I/O routines in this file byte-swap multi-byte fields.
-// bam_no_B, when non-zero, makes bam_read1() call bam_remove_B() on every
-// record it reads. bam_verbose is not referenced anywhere in this file --
-// presumably a verbosity level consumed by other translation units.
-int bam_is_be = 0, bam_verbose = 2, bam_no_B = 0;
-// Indexed by FLAG bit position in bam_format1_core() (string FLAG output);
-// bits whose entry is '\0' are not printed.
-char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";
-
-/**************************
- * CIGAR related routines *
- **************************/
-
-// Compute the end coordinate of an alignment by walking its CIGAR.
-// The running end starts at c->pos and grows by the length of every
-// operation for which bam_cigar_type(op)&2 is set (presumably the
-// reference-consuming operations -- confirm against bam.h).
-// A BAM_CBACK ('B') operation rewinds: earlier operations are scanned from
-// right to left until `len` query bases (bam_cigar_type&1) are covered, and
-// the bit-2 length met on the way (v) is subtracted from the running end.
-// Returns the resulting coordinate (an int converted to uint32_t).
-uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar)
-{
- int k, end = c->pos;
- for (k = 0; k < c->n_cigar; ++k) {
- int op = bam_cigar_op(cigar[k]);
- int len = bam_cigar_oplen(cigar[k]);
- if (op == BAM_CBACK) { // move backward
- int l, u, v;
- if (k == c->n_cigar - 1) break; // skip trailing 'B'
- // u: query bases covered so far while looking back; v: amount to take off `end`
- for (l = k - 1, u = v = 0; l >= 0; --l) {
- int op1 = bam_cigar_op(cigar[l]);
- int len1 = bam_cigar_oplen(cigar[l]);
- if (bam_cigar_type(op1)&1) { // consume query
- if (u + len1 >= len) { // stop
- // only the part of this operation that the rewind actually covers counts
- if (bam_cigar_type(op1)&2) v += len - u;
- break;
- } else u += len1;
- }
- if (bam_cigar_type(op1)&2) v += len1;
- }
- // the look-back ran past the first operation: fall back to the start position
- end = l < 0? c->pos : end - v;
- } else if (bam_cigar_type(op)&2) end += bam_cigar_oplen(cigar[k]);
- }
- return end;
-}
-
-// Sum the lengths of all CIGAR operations whose bam_cigar_type() has bit 0
-// set (the operations this file labels as consuming the query) and return
-// the total.
-int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar)
-{
-    int32_t qlen = 0;
-    uint32_t idx = 0;
-    while (idx < c->n_cigar) {
-        const uint32_t elem = cigar[idx++];
-        if (bam_cigar_type(bam_cigar_op(elem)) & 1) {
-            qlen += bam_cigar_oplen(elem);
-        }
-    }
-    return qlen;
-}
-
-/********************
- * BAM I/O routines *
- ********************/
-
-// Allocate a zero-filled header. As a side effect the global bam_is_be is
-// refreshed from bam_is_big_endian(). Returns whatever calloc() returns.
-bam_header_t *bam_header_init()
-{
-    bam_header_t *hdr;
-    bam_is_be = bam_is_big_endian();
-    hdr = (bam_header_t*)calloc(1, sizeof(bam_header_t));
-    return hdr;
-}
-
-// Release a header and everything it owns: the target names and lengths,
-// the text, the parsed dictionary, the RG->library table and the hash.
-// A null header is accepted and ignored.
-void bam_header_destroy(bam_header_t *header)
-{
-    extern void bam_destroy_header_hash(bam_header_t *header);
-    int32_t idx;
-    if (!header) return;
-    if (header->target_name != 0) {
-        idx = 0;
-        while (idx < header->n_targets) {
-            free(header->target_name[idx]);
-            ++idx;
-        }
-        free(header->target_name);
-        // target_len is only released together with target_name
-        free(header->target_len);
-    }
-    free(header->text);
-    if (header->dict != 0) sam_header_free(header->dict);
-    if (header->rg2lib != 0) sam_tbl_destroy(header->rg2lib);
-    bam_destroy_header_hash(header);
-    free(header);
-}
-
-// Read a BAM header from `fp`.
-//
-// Checks the BGZF EOF marker (warning only), verifies the "BAM\1" magic,
-// then reads the header text, the number of reference sequences and each
-// sequence's name and length, byte-swapping when bam_is_be is set.
-//
-// Returns a newly allocated header that the caller releases with
-// bam_header_destroy(), or 0 when the magic is wrong, the input is truncated
-// or inconsistent, or memory cannot be allocated. Every read and allocation
-// is checked so that a damaged file cannot lead to writes through null
-// pointers or into undersized buffers.
-bam_header_t *bam_header_read(bamFile fp)
-{
-    bam_header_t *header;
-    char buf[4];
-    int magic_len;
-    int32_t i, name_len;
-    // check EOF
-    i = bgzf_check_EOF(fp);
-    if (i < 0) {
-        // If the file is a pipe, checking the EOF marker will *always* fail
-        // with ESPIPE. Suppress the error message in this case.
-        if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF");
-    }
-    else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent. The input is probably truncated.\n");
-    // read "BAM1"
-    magic_len = bam_read(fp, buf, 4);
-    if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) {
-        fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
-        return 0;
-    }
-    header = bam_header_init();
-    if (header == 0) return 0;
-    // read plain text and the number of reference sequences
-    if (bam_read(fp, &header->l_text, 4) != 4) goto fail;
-    if (bam_is_be) bam_swap_endian_4p(&header->l_text);
-    // widen before adding 1 so that a maximal length cannot wrap around to 0
-    header->text = (char*)calloc((size_t)header->l_text + 1, 1);
-    if (header->text == 0) goto fail;
-    if ((int64_t)bam_read(fp, header->text, header->l_text) != (int64_t)header->l_text) goto fail;
-    if (bam_read(fp, &header->n_targets, 4) != 4) goto fail;
-    if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
-    if (header->n_targets < 0) goto fail;
-    // read reference sequence names and lengths
-    header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
-    header->target_len = (uint32_t*)calloc(header->n_targets, 4);
-    if (header->n_targets > 0 && (header->target_name == 0 || header->target_len == 0)) {
-        // bam_header_destroy() frees target_len only when target_name is set,
-        // so release both here before bailing out
-        free(header->target_len); header->target_len = 0;
-        free(header->target_name); header->target_name = 0;
-        goto fail;
-    }
-    for (i = 0; i != header->n_targets; ++i) {
-        if (bam_read(fp, &name_len, 4) != 4) goto fail;
-        if (bam_is_be) bam_swap_endian_4p(&name_len);
-        if (name_len <= 0) goto fail; // the stored length includes the trailing NUL
-        header->target_name[i] = (char*)calloc(name_len, 1);
-        if (header->target_name[i] == 0) goto fail;
-        if (bam_read(fp, header->target_name[i], name_len) != name_len) goto fail;
-        // names are later used as C strings; make sure they are terminated
-        header->target_name[i][name_len - 1] = '\0';
-        if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail;
-        if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
-    }
-    return header;
-
-fail:
-    fprintf(stderr, "[bam_header_read] failed to read the BAM header (truncated or corrupt input, or out of memory).\n");
-    // names not read yet are still null (calloc), so a partial header is safe to destroy
-    bam_header_destroy(header);
-    return 0;
-}
-
-// Write `header` to `fp` in BAM binary form: the "BAM\1" magic, the text
-// length and text, the number of reference sequences, then for each sequence
-// its name length (including the NUL), name and length. 32-bit fields are
-// byte-swapped first when bam_is_be is set. The stream is flushed at the
-// end. Always returns 0; write results are not inspected.
-int bam_header_write(bamFile fp, const bam_header_t *header)
-{
-    char magic[4];
-    int32_t t, name_len, word;
-    // write "BAM1"
-    memcpy(magic, "BAM\001", 4);
-    bam_write(fp, magic, 4);
-    // text length, followed by the text itself when there is any
-    if (bam_is_be) {
-        word = bam_swap_endian_4(header->l_text);
-        bam_write(fp, &word, 4);
-    } else {
-        bam_write(fp, &header->l_text, 4);
-    }
-    if (header->l_text) bam_write(fp, header->text, header->l_text);
-    // number of reference sequences
-    if (bam_is_be) {
-        word = bam_swap_endian_4(header->n_targets);
-        bam_write(fp, &word, 4);
-    } else {
-        bam_write(fp, &header->n_targets, 4);
-    }
-    // sequence names and lengths
-    for (t = 0; t != header->n_targets; ++t) {
-        const char *name = header->target_name[t];
-        name_len = strlen(name) + 1;
-        if (bam_is_be) {
-            word = bam_swap_endian_4(name_len);
-            bam_write(fp, &word, 4);
-            bam_write(fp, name, name_len);
-            word = bam_swap_endian_4(header->target_len[t]);
-            bam_write(fp, &word, 4);
-        } else {
-            bam_write(fp, &name_len, 4);
-            bam_write(fp, name, name_len);
-            bam_write(fp, &header->target_len[t], 4);
-        }
-    }
-    bgzf_flush(fp);
-    return 0;
-}
-
-// Byte-swap, in place, the multi-byte fields stored in a record's
-// variable-length data: the c->n_cigar 32-bit CIGAR words and the values of
-// the auxiliary fields. Called by bam_read1() and bam_write1_core(), in both
-// cases only when bam_is_be is set.
-//   c        - core fields; supplies l_qname, n_cigar and l_qseq
-//   data_len - number of valid bytes in `data`
-//   data     - record data, modified in place
-static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data)
-{
- uint8_t *s;
- uint32_t i, *cigar = (uint32_t*)(data + c->l_qname);
- // the auxiliary area starts after qname, CIGAR, (l_qseq+1)/2 sequence bytes and l_qseq quality bytes
- s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
- for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]);
- while (s < data + data_len) {
- uint8_t type;
- s += 2; // skip key
- // signed and unsigned variants have the same width, so the case is folded
- type = toupper(*s); ++s; // skip type
- if (type == 'C' || type == 'A') ++s;
- else if (type == 'S') { bam_swap_endian_2p(s); s += 2; }
- else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; }
- else if (type == 'D') { bam_swap_endian_8p(s); s += 8; }
- else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; }
- else if (type == 'B') {
- // NOTE(review): this branch looks defective and needs confirming:
- //  * `s` is never advanced past the sub-type, the count and the array, so
- //    the next iteration appears to parse array bytes as a new tag;
- //  * the loops step by the element size but stop at `n`, whereas
- //    bam_format1_core() treats the same count as a number of elements, so
- //    only about n/Bsize elements seem to be swapped;
- //  * `n` is read before the count is swapped, yet this function is applied
- //    both to freshly read data and to data about to be written, so in one
- //    of the two directions the count is in the wrong byte order.
- int32_t n, Bsize = bam_aux_type2size(*s);
- memcpy(&n, s + 1, 4);
- if (1 == Bsize) {
- } else if (2 == Bsize) {
- for (i = 0; i < n; i += 2)
- bam_swap_endian_2p(s + 5 + i);
- } else if (4 == Bsize) {
- for (i = 0; i < n; i += 4)
- bam_swap_endian_4p(s + 5 + i);
- }
- bam_swap_endian_4p(s+1);
- }
- }
-}
-
-// Read one alignment record from `fp` into `b`, growing b->data as needed.
-//
-// Returns the number of bytes consumed (4 + block length) on success, or
-//   -1  normal end of file (nothing could be read),
-//   -2  truncated block-length field,
-//   -3  truncated core section,
-//   -4  truncated or inconsistent variable-length data, or out of memory.
-// On a negative return the contents of `b` must not be relied upon.
-int bam_read1(bamFile fp, bam1_t *b)
-{
-    bam1_core_t *c = &b->core;
-    int32_t block_len, ret, i;
-    uint32_t x[8];
-
-    assert(BAM_CORE_SIZE == 32);
-    if ((ret = bam_read(fp, &block_len, 4)) != 4) {
-        if (ret == 0) return -1; // normal end-of-file
-        else return -2; // truncated
-    }
-    if (bam_read(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE) return -3;
-    if (bam_is_be) {
-        bam_swap_endian_4p(&block_len);
-        for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
-    }
-    // a block shorter than the core section would give a negative data length
-    if (block_len < (int32_t)BAM_CORE_SIZE) return -4;
-    c->tid = x[0]; c->pos = x[1];
-    c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
-    c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
-    c->l_qseq = x[4];
-    c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
-    b->data_len = block_len - BAM_CORE_SIZE;
-    if (b->m_data < b->data_len) {
-        int new_size = b->data_len;
-        uint8_t *new_data;
-        kroundup32(new_size);
-        // keep the old buffer reachable if realloc fails
-        new_data = (uint8_t*)realloc(b->data, new_size);
-        if (new_data == 0) return -4;
-        b->data = new_data;
-        b->m_data = new_size;
-    }
-    if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
-    // reject lengths that cannot fit in the data that was actually read
-    if (c->l_qseq < 0 || c->l_qseq > b->data_len) return -4;
-    b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
-    if (b->l_aux < 0) return -4;
-    if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
-    if (bam_no_B) bam_remove_B(b);
-    return 4 + block_len;
-}
-
-// Write one alignment record: a 4-byte block length, the 32-byte packed core
-// and `data_len` bytes of variable-length data. When bam_is_be is set the
-// length, the core words and the data are byte-swapped for output, and the
-// data is swapped back afterwards so the caller's buffer ends up unchanged.
-// Returns 4 + block length; write results are not inspected.
-inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data)
-{
-    uint32_t core[8], block_len, swapped_len;
-    int w;
-    assert(BAM_CORE_SIZE == 32);
-    block_len = data_len + BAM_CORE_SIZE;
-    // pack the core fields into eight 32-bit words
-    core[0] = c->tid;
-    core[1] = c->pos;
-    core[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname;
-    core[3] = (uint32_t)c->flag<<16 | c->n_cigar;
-    core[4] = c->l_qseq;
-    core[5] = c->mtid;
-    core[6] = c->mpos;
-    core[7] = c->isize;
-    bgzf_flush_try(fp, 4 + block_len);
-    if (!bam_is_be) {
-        bam_write(fp, &block_len, 4);
-        bam_write(fp, core, BAM_CORE_SIZE);
-        bam_write(fp, data, data_len);
-    } else {
-        for (w = 0; w < 8; ++w) bam_swap_endian_4p(core + w);
-        swapped_len = block_len;
-        bam_write(fp, bam_swap_endian_4p(&swapped_len), 4);
-        swap_endian_data(c, data_len, data);
-        bam_write(fp, core, BAM_CORE_SIZE);
-        bam_write(fp, data, data_len);
-        // undo the in-place swap
-        swap_endian_data(c, data_len, data);
-    }
-    return 4 + block_len;
-}
-
-// Write the record `b` to `fp`; a thin wrapper around bam_write1_core()
-// whose return value is passed through.
-int bam_write1(bamFile fp, const bam1_t *b)
-{
-    const bam1_core_t *core = &b->core;
-    return bam_write1_core(fp, core, b->data_len, b->data);
-}
-
-// Format one alignment as a SAM text line (without a trailing newline).
-//   header - supplies reference names; when null, numeric ids are printed
-//   b      - the alignment to format
-//   of     - FLAG style: BAM_OFDEC (decimal), BAM_OFHEX (hexadecimal), any
-//            other value prints the letters from bam_flag2char_table
-// Returns the buffer of a kstring_t that the caller releases with free()
-// (as bam_view1() does).
-//
-// Multi-byte auxiliary values are copied out with memcpy() rather than read
-// through casted pointers, because the auxiliary area is packed and the
-// values may sit at any byte offset.
-char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of)
-{
-    uint8_t *s = bam1_seq(b), *t = bam1_qual(b);
-    int i;
-    const bam1_core_t *c = &b->core;
-    kstring_t str;
-    str.l = str.m = 0; str.s = 0;
-
-    // QNAME and FLAG
-    kputsn(bam1_qname(b), c->l_qname-1, &str); kputc('\t', &str);
-    if (of == BAM_OFDEC) { kputw(c->flag, &str); kputc('\t', &str); }
-    else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag);
-    else { // BAM_OFSTR
-        for (i = 0; i < 16; ++i)
-            if ((c->flag & 1<<i) && bam_flag2char_table[i])
-                kputc(bam_flag2char_table[i], &str);
-        kputc('\t', &str);
-    }
-    // RNAME
-    if (c->tid < 0) kputsn("*\t", 2, &str);
-    else {
-        if (header) kputs(header->target_name[c->tid] , &str);
-        else kputw(c->tid, &str);
-        kputc('\t', &str);
-    }
-    // POS (printed 1-based) and MAPQ
-    kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str);
-    // CIGAR
-    if (c->n_cigar == 0) kputc('*', &str);
-    else {
-        uint32_t *cigar = bam1_cigar(b);
-        for (i = 0; i < c->n_cigar; ++i) {
-            kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str);
-            kputc(bam_cigar_opchr(cigar[i]), &str);
-        }
-    }
-    kputc('\t', &str);
-    // mate reference ("=" when it equals the read's own), mate position, insert size
-    if (c->mtid < 0) kputsn("*\t", 2, &str);
-    else if (c->mtid == c->tid) kputsn("=\t", 2, &str);
-    else {
-        if (header) kputs(header->target_name[c->mtid], &str);
-        else kputw(c->mtid, &str);
-        kputc('\t', &str);
-    }
-    kputw(c->mpos + 1, &str); kputc('\t', &str); kputw(c->isize, &str); kputc('\t', &str);
-    // SEQ and QUAL; a first quality byte of 0xff means qualities are absent
-    if (c->l_qseq) {
-        for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str);
-        kputc('\t', &str);
-        if (t[0] == 0xff) kputc('*', &str);
-        else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str);
-    } else kputsn("*\t*", 3, &str);
-    // auxiliary fields: 2-byte key, 1-byte type, then the value
-    s = bam1_aux(b);
-    while (s < b->data + b->data_len) {
-        uint8_t type, key[2];
-        key[0] = s[0]; key[1] = s[1];
-        s += 2; type = *s; ++s;
-        kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str);
-        if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; }
-        else if (type == 'C') { kputsn("i:", 2, &str); kputw(*s, &str); ++s; }
-        else if (type == 'c') { kputsn("i:", 2, &str); kputw(*(int8_t*)s, &str); ++s; }
-        else if (type == 'S') { uint16_t v; memcpy(&v, s, 2); kputsn("i:", 2, &str); kputw(v, &str); s += 2; }
-        else if (type == 's') { int16_t v; memcpy(&v, s, 2); kputsn("i:", 2, &str); kputw(v, &str); s += 2; }
-        else if (type == 'I') { uint32_t v; memcpy(&v, s, 4); kputsn("i:", 2, &str); kputuw(v, &str); s += 4; }
-        else if (type == 'i') { int32_t v; memcpy(&v, s, 4); kputsn("i:", 2, &str); kputw(v, &str); s += 4; }
-        else if (type == 'f') { float v; memcpy(&v, s, 4); ksprintf(&str, "f:%g", v); s += 4; }
-        else if (type == 'd') { double v; memcpy(&v, s, 8); ksprintf(&str, "d:%lg", v); s += 8; }
-        else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; }
-        else if (type == 'B') {
-            uint8_t sub_type = *(s++);
-            int32_t n;
-            memcpy(&n, s, 4);
-            s += 4; // now points to the start of the array
-            kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); // write the typing
-            for (i = 0; i < n; ++i) {
-                kputc(',', &str);
-                if ('c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; }
-                else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; }
-                else if ('s' == sub_type) { int16_t v; memcpy(&v, s, 2); kputw(v, &str); s += 2; }
-                else if ('S' == sub_type) { uint16_t v; memcpy(&v, s, 2); kputw(v, &str); s += 2; }
-                else if ('i' == sub_type) { int32_t v; memcpy(&v, s, 4); kputw(v, &str); s += 4; }
-                else if ('I' == sub_type) { uint32_t v; memcpy(&v, s, 4); kputuw(v, &str); s += 4; }
-                else if ('f' == sub_type) { float v; memcpy(&v, s, 4); ksprintf(&str, "%g", v); s += 4; }
-            }
-        }
-    }
-    return str.s;
-}
-
-// Format `b` as a SAM line with the FLAG printed in decimal (BAM_OFDEC).
-// The returned buffer comes from bam_format1_core().
-char *bam_format1(const bam_header_t *header, const bam1_t *b)
-{
-    char *text = bam_format1_core(header, b, BAM_OFDEC);
-    return text;
-}
-
-// Print `b` to standard output as one SAM line, then free the temporary
-// string produced by bam_format1().
-void bam_view1(const bam_header_t *header, const bam1_t *b)
-{
-    char *line = bam_format1(header, b);
-    puts(line);
-    free(line);
-}
-
-// Basic sanity checks on a record. Returns 1 when the record passes, 0
-// otherwise. Checked: tid and mtid are not below -1; with a header, both are
-// below header->n_targets; the data is at least l_qname bytes long; and the
-// first NUL inside the query name is exactly its last byte.
-int bam_validate1(const bam_header_t *header, const bam1_t *b)
-{
-    const bam1_core_t *core = &b->core;
-    char *first_nul;
-
-    if (core->tid < -1 || core->mtid < -1) return 0;
-    if (header) {
-        if (core->tid >= header->n_targets || core->mtid >= header->n_targets) return 0;
-    }
-
-    if (b->data_len < core->l_qname) return 0;
-    first_nul = memchr(bam1_qname(b), '\0', core->l_qname);
-    if (first_nul != &bam1_qname(b)[core->l_qname-1]) return 0;
-
-    // FIXME: Other fields could also be checked, especially the auxiliary data
-
-    return 1;
-}
-
-// Look up the library of an alignment through its RG tag. On first use the
-// header text is parsed into h->dict and an RG ID -> LB table is cached in
-// h->rg2lib. Returns 0 when the record has no RG tag; otherwise the result
-// of the table lookup.
-// FIXME: we should also check the LB tag associated with each alignment
-const char *bam_get_library(bam_header_t *h, const bam1_t *b)
-{
-    const uint8_t *rg_tag;
-    if (!h->dict) h->dict = sam_header_parse2(h->text);
-    if (!h->rg2lib) h->rg2lib = sam_header2tbl(h->dict, "RG", "ID", "LB");
-    rg_tag = bam_aux_get(b, "RG");
-    if (rg_tag == 0) return 0;
-    // skip the leading type byte to reach the tag's string value
-    return sam_tbl_get(h->rg2lib, (const char*)(rg_tag + 1));
-}
-
-/************
- * Remove B *
- ************/
-
-// Rewrite a record so that its CIGAR no longer contains BAM_CBACK ('B')
-// operations. Query segments that overlap because of a 'B' are merged into a
-// consensus, and CIGAR, SEQ, QUAL and the auxiliary data are repacked in
-// place inside b->data.
-// Returns 0 when nothing had to be done (unmapped record or no 'B') or on
-// success; returns -1 and sets BAM_FUNMAP in the flag when the 'B' cannot be
-// removed (leading 'B', or a 'B' longer than the query written so far).
-int bam_remove_B(bam1_t *b)
-{
- // i: read position in the original query; j: write position in the merged query;
- // end_j: value of j just before the latest 'B'; l: number of entries in new_cigar
- int i, j, end_j, k, l, no_qual;
- uint32_t *cigar, *new_cigar;
- uint8_t *seq, *qual, *p;
- // test if removal is necessary
- if (b->core.flag & BAM_FUNMAP) return 0; // unmapped; do nothing
- cigar = bam1_cigar(b);
- for (k = 0; k < b->core.n_cigar; ++k)
- if (bam_cigar_op(cigar[k]) == BAM_CBACK) break;
- if (k == b->core.n_cigar) return 0; // no 'B'
- if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed
- // allocate memory for the new CIGAR
- if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory
- b->m_data = b->data_len + b->core.n_cigar * 4;
- kroundup32(b->m_data);
- // NOTE(review): the realloc result is not checked before use
- b->data = (uint8_t*)realloc(b->data, b->m_data);
- cigar = bam1_cigar(b); // after realloc, cigar may be changed
- }
- // the new CIGAR is built in the unused tail of the record buffer
- new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data
- // the core loop
- seq = bam1_seq(b); qual = bam1_qual(b);
- no_qual = (qual[0] == 0xff); // test whether base quality is available
- i = j = 0; end_j = -1;
- for (k = l = 0; k < b->core.n_cigar; ++k) {
- int op = bam_cigar_op(cigar[k]);
- int len = bam_cigar_oplen(cigar[k]);
- if (op == BAM_CBACK) { // the backward operation
- int t, u;
- if (k == b->core.n_cigar - 1) break; // ignore 'B' at the end of CIGAR
- if (len > j) goto rmB_err; // an excessively long backward
- for (t = l - 1, u = 0; t >= 0; --t) { // look back
- int op1 = bam_cigar_op(new_cigar[t]);
- int len1 = bam_cigar_oplen(new_cigar[t]);
- if (bam_cigar_type(op1)&1) { // consume the query
- if (u + len1 >= len) { // stop
- new_cigar[t] -= (len - u) << BAM_CIGAR_SHIFT;
- break;
- } else u += len1;
- }
- }
- // NOTE(review): if the look-back finds no query-consuming operation, t is -1
- // here and new_cigar[-1] is read; this seems reachable only for a
- // zero-length 'B' -- verify
- if (bam_cigar_oplen(new_cigar[t]) == 0) --t; // squeeze out the zero-length operation
- l = t + 1;
- end_j = j; j -= len;
- } else { // other CIGAR operations
- new_cigar[l++] = cigar[k];
- if (bam_cigar_type(op)&1) { // consume the query
- if (i != j) { // no need to copy if i == j
- int u, c, c0;
- for (u = 0; u < len; ++u) { // construct the consensus
- c = bam1_seqi(seq, i+u);
- if (j + u < end_j) { // in an overlap
- c0 = bam1_seqi(seq, j+u);
- if (c != c0) { // a mismatch; choose the better base
- if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better
- bam1_seq_seti(seq, j+u, c);
- qual[j+u] = qual[i+u] - qual[j+u];
- } else qual[j+u] -= qual[i+u]; // the 1st is better; reduce base quality
- } else qual[j+u] = qual[j+u] > qual[i+u]? qual[j+u] : qual[i+u];
- } else { // not in an overlap; copy over
- bam1_seq_seti(seq, j+u, c);
- qual[j+u] = qual[i+u];
- }
- }
- }
- i += len, j += len;
- }
- }
- }
- if (no_qual) qual[0] = 0xff; // in very rare cases, this may be modified
- // merge adjacent operations if possible
- // the length of entry k-1 is added to entry k and entry k-1 is reduced to its
- // operation code, i.e. to zero length, so the next pass drops it
- for (k = 1; k < l; ++k)
- if (bam_cigar_op(new_cigar[k]) == bam_cigar_op(new_cigar[k-1]))
- new_cigar[k] += new_cigar[k-1] >> BAM_CIGAR_SHIFT << BAM_CIGAR_SHIFT, new_cigar[k-1] &= 0xf;
- // kill zero length operations
- for (k = i = 0; k < l; ++k)
- if (new_cigar[k] >> BAM_CIGAR_SHIFT)
- new_cigar[i++] = new_cigar[k];
- l = i;
- // update b
- // repack the sections front to back: CIGAR, SEQ, QUAL, then the auxiliary data;
- // memmove is used because source and destination ranges can overlap
- memcpy(cigar, new_cigar, l * 4); // set CIGAR
- p = b->data + b->core.l_qname + l * 4;
- memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ
- memmove(p, qual, j); p += j; // set QUAL
- memmove(p, bam1_aux(b), b->l_aux); p += b->l_aux; // set optional fields
- b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length
- b->data_len = p - b->data; // update record length
- return 0;
-
-rmB_err:
- // mark the record as unmapped so that downstream code skips it
- b->core.flag |= BAM_FUNMAP;
- return -1;
-}
diff --git a/sam/bam.h b/sam/bam.h
deleted file mode 100644
index 80e8703..0000000
--- a/sam/bam.h
+++ /dev/null
@@ -1,793 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008-2010 Genome Research Ltd (GRL).
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/* Contact: Heng Li <***@sanger.ac.uk> */
-
-#ifndef BAM_BAM_H
-#define BAM_BAM_H
-
-/*!
- @header
-
- BAM library provides I/O and various operations on manipulating files
- in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map)
- format. It now supports importing from or exporting to SAM, sorting,
- merging, generating pileup, and quickly retrieval of reads overlapped
- with a specified region.
-
- @copyright Genome Research Ltd.
- */
-
-#define BAM_VERSION "0.1.19-44428cd"
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-
-#ifndef BAM_LITE
-#define BAM_VIRTUAL_OFFSET16
-#include "bgzf.h"
-/*! @abstract BAM file handler */
-typedef BGZF *bamFile;
-#define bam_open(fn, mode) bgzf_open(fn, mode)
-#define bam_dopen(fd, mode) bgzf_fdopen(fd, mode)
-#define bam_close(fp) bgzf_close(fp)
-#define bam_read(fp, buf, size) bgzf_read(fp, buf, size)
-#define bam_write(fp, buf, size) bgzf_write(fp, buf, size)
-#define bam_tell(fp) bgzf_tell(fp)
-#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir)
-#else
-#define BAM_TRUE_OFFSET
-#include <zlib.h>
-typedef gzFile bamFile;
-#define bam_open(fn, mode) gzopen(fn, mode)
-#define bam_dopen(fd, mode) gzdopen(fd, mode)
-#define bam_close(fp) gzclose(fp)
-#define bam_read(fp, buf, size) gzread(fp, buf, size)
-/* no bam_write/bam_tell/bam_seek() here */
-#endif
-
-/*! @typedef
- @abstract Structure for the alignment header.
- @field n_targets number of reference sequences
- @field target_name names of the reference sequences
- @field target_len lengths of the referene sequences
- @field dict header dictionary
- @field hash hash table for fast name lookup
- @field rg2lib hash table for @RG-ID -> LB lookup
- @field l_text length of the plain text in the header
- @field text plain text
-
- @discussion Field hash points to null by default. It is a private
- member.
- */
-typedef struct {
- int32_t n_targets;
- char **target_name;
- uint32_t *target_len;
- void *dict, *hash, *rg2lib;
- uint32_t l_text, n_text;
- char *text;
-} bam_header_t;
-
-/*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
-#define BAM_FPAIRED 1
-/*! @abstract the read is mapped in a proper pair */
-#define BAM_FPROPER_PAIR 2
-/*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
-#define BAM_FUNMAP 4
-/*! @abstract the mate is unmapped */
-#define BAM_FMUNMAP 8
-/*! @abstract the read is mapped to the reverse strand */
-#define BAM_FREVERSE 16
-/*! @abstract the mate is mapped to the reverse strand */
-#define BAM_FMREVERSE 32
-/*! @abstract this is read1 */
-#define BAM_FREAD1 64
-/*! @abstract this is read2 */
-#define BAM_FREAD2 128
-/*! @abstract not primary alignment */
-#define BAM_FSECONDARY 256
-/*! @abstract QC failure */
-#define BAM_FQCFAIL 512
-/*! @abstract optical or PCR duplicate */
-#define BAM_FDUP 1024
-
-#define BAM_OFDEC 0
-#define BAM_OFHEX 1
-#define BAM_OFSTR 2
-
-/*! @abstract defautl mask for pileup */
-#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
-
-#define BAM_CORE_SIZE sizeof(bam1_core_t)
-
-/**
- * Describing how CIGAR operation/length is packed in a 32-bit integer.
- */
-#define BAM_CIGAR_SHIFT 4
-#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
-
-/*
- CIGAR operations.
- */
-/*! @abstract CIGAR: M = match or mismatch*/
-#define BAM_CMATCH 0
-/*! @abstract CIGAR: I = insertion to the reference */
-#define BAM_CINS 1
-/*! @abstract CIGAR: D = deletion from the reference */
-#define BAM_CDEL 2
-/*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */
-#define BAM_CREF_SKIP 3
-/*! @abstract CIGAR: S = clip on the read with clipped sequence
- present in qseq */
-#define BAM_CSOFT_CLIP 4
-/*! @abstract CIGAR: H = clip on the read with clipped sequence trimmed off */
-#define BAM_CHARD_CLIP 5
-/*! @abstract CIGAR: P = padding */
-#define BAM_CPAD 6
-/*! @abstract CIGAR: equals = match */
-#define BAM_CEQUAL 7
-/*! @abstract CIGAR: X = mismatch */
-#define BAM_CDIFF 8
-#define BAM_CBACK 9
-
-#define BAM_CIGAR_STR "MIDNSHP=XB"
-#define BAM_CIGAR_TYPE 0x3C1A7
-
-#define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK)
-#define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT)
-#define bam_cigar_opchr(c) (BAM_CIGAR_STR[bam_cigar_op(c)])
-#define bam_cigar_gen(l, o) ((l)<<BAM_CIGAR_SHIFT|(o))
-#define bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference
-
-/*! @typedef
- @abstract Structure for core alignment information.
- @field tid chromosome ID, defined by bam_header_t
- @field pos 0-based leftmost coordinate
- @field bin bin calculated by bam_reg2bin()
- @field qual mapping quality
- @field l_qname length of the query name
- @field flag bitwise flag
- @field n_cigar number of CIGAR operations
- @field l_qseq length of the query sequence (read)
- */
-typedef struct {
- int32_t tid;
- int32_t pos;
- uint32_t bin:16, qual:8, l_qname:8;
- uint32_t flag:16, n_cigar:16;
- int32_t l_qseq;
- int32_t mtid;
- int32_t mpos;
- int32_t isize;
-} bam1_core_t;
-
-/*! @typedef
- @abstract Structure for one alignment.
- @field core core information about the alignment
- @field l_aux length of auxiliary data
- @field data_len current length of bam1_t::data
- @field m_data maximum length of bam1_t::data
- @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
-
- @discussion Notes:
-
- 1. qname is zero tailing and core.l_qname includes the tailing '\0'.
- 2. l_qseq is calculated from the total length of an alignment block
- on reading or from CIGAR.
- 3. cigar data is encoded 4 bytes per CIGAR operation.
- 4. seq is nybble-encoded according to bam_nt16_table.
- */
-typedef struct {
- bam1_core_t core;
- int l_aux, data_len, m_data;
- uint8_t *data;
-} bam1_t;
-
-typedef struct __bam_iter_t *bam_iter_t;
-
-#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
-#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
-
-/*! @function
- @abstract Get the CIGAR array
- @param b pointer to an alignment
- @return pointer to the CIGAR array
-
- @discussion In the CIGAR array, each element is a 32-bit integer. The
- lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
- length of a CIGAR.
- */
-#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
-
-/*! @function
- @abstract Get the name of the query
- @param b pointer to an alignment
- @return pointer to the name string, null terminated
- */
-#define bam1_qname(b) ((char*)((b)->data))
-
-/*! @function
- @abstract Get query sequence
- @param b pointer to an alignment
- @return pointer to sequence
-
- @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
- 8 for T and 15 for N. Two bases are packed in one byte with the base
- at the higher 4 bits having smaller coordinate on the read. It is
- recommended to use bam1_seqi() macro to get the base.
- */
-#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
-
-/*! @function
- @abstract Get query quality
- @param b pointer to an alignment
- @return pointer to quality string
- */
-#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))
-
-/*! @function
- @abstract Get a base on read
- @param s Query sequence returned by bam1_seq()
- @param i The i-th position, 0-based
- @return 4-bit integer representing the base.
- */
-//#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
-#define bam1_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf)
-
-#define bam1_seq_seti(s, i, c) ( (s)[(i)>>1] = ((s)[(i)>>1] & 0xf<<(((i)&1)<<2)) | (c)<<((~(i)&1)<<2) )
-
-/*! @function
- @abstract Get query sequence and quality
- @param b pointer to an alignment
- @return pointer to the concatenated auxiliary data
- */
-#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
-
-#ifndef kroundup32
-/*! @function
- @abstract Round an integer to the next closest power-2 integer.
- @param x integer to be rounded (in place)
- @discussion x will be modified.
- */
-#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
-#endif
-
-/*!
- @abstract Whether the machine is big-endian; modified only in
- bam_header_init().
- */
-extern int bam_is_be;
-
-/*!
- @abstract Verbose level between 0 and 3; 0 is supposed to disable all
- debugging information, though this may not have been implemented.
- */
-extern int bam_verbose;
-
-extern int bam_no_B;
-
-/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
-extern unsigned char bam_nt16_table[256];
-
-/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */
-extern char *bam_nt16_rev_table;
-
-extern char bam_nt16_nt4_table[];
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- /*********************
- * Low-level SAM I/O *
- *********************/
-
- /*! @abstract TAM file handler */
- typedef struct __tamFile_t *tamFile;
-
- /*!
- @abstract Open a SAM file for reading, either uncompressed or compressed by gzip/zlib.
- @param fn SAM file name
- @return SAM file handler
- */
- tamFile sam_open(const char *fn);
-
- /*!
- @abstract Close a SAM file handler
- @param fp SAM file handler
- */
- void sam_close(tamFile fp);
-
- /*!
- @abstract Read one alignment from a SAM file handler
- @param fp SAM file handler
- @param header header information (ordered names of chromosomes)
- @param b read alignment; all members in b will be updated
- @return 0 if successful; otherwise negative
- */
- int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b);
-
- /*!
- @abstract Read header information from a TAB-delimited list file.
- @param fn_list file name for the list
- @return a pointer to the header structure
-
- @discussion Each line in this file consists of chromosome name and
- the length of chromosome.
- */
- bam_header_t *sam_header_read2(const char *fn_list);
-
- /*!
- @abstract Read header from a SAM file (if present)
- @param fp SAM file handler
- @return pointer to header struct; 0 if no @SQ lines available
- */
- bam_header_t *sam_header_read(tamFile fp);
-
- /*!
- @abstract Parse @SQ lines a update a header struct
- @param h pointer to the header struct to be updated
- @return number of target sequences
-
- @discussion bam_header_t::{n_targets,target_len,target_name} will
- be destroyed in the first place.
- */
- int sam_header_parse(bam_header_t *h);
- int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
-
- /*!
- @abstract Parse @RG lines a update a header struct
- @param h pointer to the header struct to be updated
- @return number of @RG lines
-
- @discussion bam_header_t::rg2lib will be destroyed in the first
- place.
- */
- int sam_header_parse_rg(bam_header_t *h);
-
-#define sam_write1(header, b) bam_view1(header, b)
-
-
- /********************************
- * APIs for string dictionaries *
- ********************************/
-
- int bam_strmap_put(void *strmap, const char *rg, const char *lib);
- const char *bam_strmap_get(const void *strmap, const char *rg);
- void *bam_strmap_dup(const void*);
- void *bam_strmap_init();
- void bam_strmap_destroy(void *strmap);
-
-
- /*********************
- * Low-level BAM I/O *
- *********************/
-
- /*!
- @abstract Initialize a header structure.
- @return the pointer to the header structure
-
- @discussion This function also modifies the global variable
- bam_is_be.
- */
- bam_header_t *bam_header_init();
-
- /*!
- @abstract Destroy a header structure.
- @param header pointer to the header
- */
- void bam_header_destroy(bam_header_t *header);
-
- /*!
- @abstract Read a header structure from BAM.
- @param fp BAM file handler, opened by bam_open()
- @return pointer to the header structure
-
- @discussion The file position indicator must be placed at the
- beginning of the file. Upon success, the position indicator will
- be set at the start of the first alignment.
- */
- bam_header_t *bam_header_read(bamFile fp);
-
- /*!
- @abstract Write a header structure to BAM.
- @param fp BAM file handler
- @param header pointer to the header structure
- @return always 0 currently
- */
- int bam_header_write(bamFile fp, const bam_header_t *header);
-
- /*!
- @abstract Read an alignment from BAM.
- @param fp BAM file handler
- @param b read alignment; all members are updated.
- @return number of bytes read from the file
-
- @discussion The file position indicator must be
- placed right before an alignment. Upon success, this function
- will set the position indicator to the start of the next
- alignment. This function is not affected by the machine
- endianness.
- */
- int bam_read1(bamFile fp, bam1_t *b);
-
- int bam_remove_B(bam1_t *b);
-
- /*!
- @abstract Write an alignment to BAM.
- @param fp BAM file handler
- @param c pointer to the bam1_core_t structure
- @param data_len total length of variable size data related to
- the alignment
- @param data pointer to the concatenated data
- @return number of bytes written to the file
-
- @discussion This function is not affected by the machine
- endianness.
- */
- int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data);
-
- /*!
- @abstract Write an alignment to BAM.
- @param fp BAM file handler
- @param b alignment to write
- @return number of bytes written to the file
-
- @abstract It is equivalent to:
- bam_write1_core(fp, &b->core, b->data_len, b->data)
- */
- int bam_write1(bamFile fp, const bam1_t *b);
-
- /*! @function
- @abstract Initiate a pointer to bam1_t struct
- */
-#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
-
- /*! @function
- @abstract Free the memory allocated for an alignment.
- @param b pointer to an alignment
- */
-#define bam_destroy1(b) do { \
- if (b) { free((b)->data); free(b); } \
- } while (0)
-
- /*!
- @abstract Format a BAM record in the SAM format
- @param header pointer to the header structure
- @param b alignment to print
- @return a pointer to the SAM string
- */
- char *bam_format1(const bam_header_t *header, const bam1_t *b);
-
- char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of);
-
- /*!
- @abstract Check whether a BAM record is plausibly valid
- @param header associated header structure, or NULL if unavailable
- @param b alignment to validate
- @return 0 if the alignment is invalid; non-zero otherwise
-
- @discussion Simple consistency check of some of the fields of the
- alignment record. If the header is provided, several additional checks
- are made. Not all fields are checked, so a non-zero result is not a
- guarantee that the record is valid. However it is usually good enough
- to detect when bam_seek() has been called with a virtual file offset
- that is not the offset of an alignment record.
- */
- int bam_validate1(const bam_header_t *header, const bam1_t *b);
-
- const char *bam_get_library(bam_header_t *header, const bam1_t *b);
-
-
- /***************
- * pileup APIs *
- ***************/
-
- /*! @typedef
- @abstract Structure for one alignment covering the pileup position.
- @field b pointer to the alignment
- @field qpos position of the read base at the pileup site, 0-based
- @field indel indel length; 0 for no indel, positive for ins and negative for del
- @field is_del 1 iff the base on the padded read is a deletion
- @field level the level of the read in the "viewer" mode
-
- @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
- difference between the two functions is that the former does not
- set bam_pileup1_t::level, while the later does. Level helps the
- implementation of alignment viewers, but calculating this has some
- overhead.
- */
- typedef struct {
- bam1_t *b;
- int32_t qpos;
- int indel, level;
- uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28;
- } bam_pileup1_t;
-
- typedef int (*bam_plp_auto_f)(void *data, bam1_t *b);
-
- struct __bam_plp_t;
- typedef struct __bam_plp_t *bam_plp_t;
-
- bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data);
- int bam_plp_push(bam_plp_t iter, const bam1_t *b);
- const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
- const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
- void bam_plp_set_mask(bam_plp_t iter, int mask);
- void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt);
- void bam_plp_reset(bam_plp_t iter);
- void bam_plp_destroy(bam_plp_t iter);
-
- struct __bam_mplp_t;
- typedef struct __bam_mplp_t *bam_mplp_t;
-
- bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data);
- void bam_mplp_destroy(bam_mplp_t iter);
- void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt);
- int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp);
-
- /*! @typedef
- @abstract Type of function to be called by bam_plbuf_push().
- @param tid chromosome ID as is defined in the header
- @param pos start coordinate of the alignment, 0-based
- @param n number of elements in pl array
- @param pl array of alignments
- @param data user provided data
- @discussion See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t.
- */
- typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
-
- typedef struct {
- bam_plp_t iter;
- bam_pileup_f func;
- void *data;
- } bam_plbuf_t;
-
- void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask);
- void bam_plbuf_reset(bam_plbuf_t *buf);
- bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data);
- void bam_plbuf_destroy(bam_plbuf_t *buf);
- int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf);
-
- int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
-
- struct __bam_lplbuf_t;
- typedef struct __bam_lplbuf_t bam_lplbuf_t;
-
- void bam_lplbuf_reset(bam_lplbuf_t *buf);
-
- /*! @abstract bam_plbuf_init() equivalent with level calculated. */
- bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data);
-
- /*! @abstract bam_plbuf_destroy() equivalent with level calculated. */
- void bam_lplbuf_destroy(bam_lplbuf_t *tv);
-
- /*! @abstract bam_plbuf_push() equivalent with level calculated. */
- int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);
-
-
- /*********************
- * BAM indexing APIs *
- *********************/
-
- struct __bam_index_t;
- typedef struct __bam_index_t bam_index_t;
-
- /*!
- @abstract Build index for a BAM file.
- @discussion Index file "fn.bai" will be created.
- @param fn name of the BAM file
- @return always 0 currently
- */
- int bam_index_build(const char *fn);
-
- /*!
- @abstract Load index from file "fn.bai".
- @param fn name of the BAM file (NOT the index file)
- @return pointer to the index structure
- */
- bam_index_t *bam_index_load(const char *fn);
-
- /*!
- @abstract Destroy an index structure.
- @param idx pointer to the index structure
- */
- void bam_index_destroy(bam_index_t *idx);
-
- /*! @typedef
- @abstract Type of function to be called by bam_fetch().
- @param b the alignment
- @param data user provided data
- */
- typedef int (*bam_fetch_f)(const bam1_t *b, void *data);
-
- /*!
- @abstract Retrieve the alignments that are overlapped with the
- specified region.
-
- @discussion A user defined function will be called for each
- retrieved alignment ordered by its start position.
-
- @param fp BAM file handler
- @param idx pointer to the alignment index
- @param tid chromosome ID as is defined in the header
- @param beg start coordinate, 0-based
- @param end end coordinate, 0-based
- @param data user provided data (will be transferred to func)
- @param func user defined function
- */
- int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
-
- bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end);
- int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b);
- void bam_iter_destroy(bam_iter_t iter);
-
- /*!
- @abstract Parse a region in the format: "chr2:100,000-200,000".
- @discussion bam_header_t::hash will be initialized if empty.
- @param header pointer to the header structure
- @param str string to be parsed
- @param ref_id the returned chromosome ID
- @param begin the returned start coordinate
- @param end the returned end coordinate
- @return 0 on success; -1 on failure
- */
- int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
-
-
- /**************************
- * APIs for optional tags *
- **************************/
-
- /*!
- @abstract Retrieve data of a tag
- @param b pointer to an alignment struct
- @param tag two-character tag to be retrieved
-
- @return pointer to the type and data. The first character is the
- type that can be 'iIsScCdfAZH'.
-
- @discussion Use bam_aux2?() series to convert the returned data to
- the corresponding type.
- */
- uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
-
- int32_t bam_aux2i(const uint8_t *s);
- float bam_aux2f(const uint8_t *s);
- double bam_aux2d(const uint8_t *s);
- char bam_aux2A(const uint8_t *s);
- char *bam_aux2Z(const uint8_t *s);
-
- int bam_aux_del(bam1_t *b, uint8_t *s);
- void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
- uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get()
-
-
- /*****************
- * Miscellaneous *
- *****************/
-
- /*!
- @abstract Calculate the rightmost coordinate of an alignment on the
- reference genome.
-
- @param c pointer to the bam1_core_t structure
- @param cigar the corresponding CIGAR array (from bam1_t::cigar)
- @return the rightmost coordinate, 0-based
- */
- uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar);
-
- /*!
- @abstract Calculate the length of the query sequence from CIGAR.
- @param c pointer to the bam1_core_t structure
- @param cigar the corresponding CIGAR array (from bam1_t::cigar)
- @return length of the query sequence
- */
- int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar);
-
-#ifdef __cplusplus
-}
-#endif
-
-/*!
- @abstract Calculate the minimum bin that contains a region [beg,end).
- @param beg start of the region, 0-based
- @param end end of the region, 0-based
- @return bin
- */
-static inline int bam_reg2bin(uint32_t beg, uint32_t end)
-{
- --end;
- if (beg>>14 == end>>14) return 4681 + (beg>>14);
- if (beg>>17 == end>>17) return 585 + (beg>>17);
- if (beg>>20 == end>>20) return 73 + (beg>>20);
- if (beg>>23 == end>>23) return 9 + (beg>>23);
- if (beg>>26 == end>>26) return 1 + (beg>>26);
- return 0;
-}
-
-/*!
- @abstract Copy an alignment
- @param bdst destination alignment struct
- @param bsrc source alignment struct
- @return pointer to the destination alignment struct
- */
-static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
-{
- uint8_t *data = bdst->data;
- int m_data = bdst->m_data; // backup data and m_data
- if (m_data < bsrc->data_len) { // double the capacity
- m_data = bsrc->data_len; kroundup32(m_data);
- data = (uint8_t*)realloc(data, m_data);
- }
- memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data
- *bdst = *bsrc; // copy the rest
- // restore the backup
- bdst->m_data = m_data;
- bdst->data = data;
- return bdst;
-}
-
-/*!
- @abstract Duplicate an alignment
- @param src source alignment struct
- @return pointer to the destination alignment struct
- */
-static inline bam1_t *bam_dup1(const bam1_t *src)
-{
- bam1_t *b;
- b = bam_init1();
- *b = *src;
- b->m_data = b->data_len;
- b->data = (uint8_t*)calloc(b->data_len, 1);
- memcpy(b->data, src->data, b->data_len);
- return b;
-}
-
-static inline int bam_aux_type2size(int x)
-{
- if (x == 'C' || x == 'c' || x == 'A') return 1;
- else if (x == 'S' || x == 's') return 2;
- else if (x == 'I' || x == 'i' || x == 'f' || x == 'F') return 4;
- else return 0;
-}
-
-/*********************************
- *** Compatibility with htslib ***
- *********************************/
-
-typedef bam_header_t bam_hdr_t;
-
-#define bam_get_qname(b) bam1_qname(b)
-#define bam_get_cigar(b) bam1_cigar(b)
-
-#define bam_hdr_read(fp) bam_header_read(fp)
-#define bam_hdr_write(fp, h) bam_header_write(fp, h)
-#define bam_hdr_destroy(fp) bam_header_destroy(fp)
-
-#endif
diff --git a/sam/bam2bcf.c b/sam/bam2bcf.c
deleted file mode 100644
index 340b10b..0000000
--- a/sam/bam2bcf.c
+++ /dev/null
@@ -1,467 +0,0 @@
-#include <math.h>
-#include <stdint.h>
-#include <assert.h>
-#include "bam.h"
-#include "kstring.h"
-#include "bam2bcf.h"
-#include "errmod.h"
-#include "bcftools/bcf.h"
-
-extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
-
-#define CALL_ETA 0.03f
-#define CALL_MAX 256
-#define CALL_DEFTHETA 0.83f
-#define DEF_MAPQ 20
-
-#define CAP_DIST 25
-
-bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
-{
- bcf_callaux_t *bca;
- if (theta <= 0.) theta = CALL_DEFTHETA;
- bca = calloc(1, sizeof(bcf_callaux_t));
- bca->capQ = 60;
- bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
- bca->min_baseQ = min_baseQ;
- bca->e = errmod_init(1. - theta);
- bca->min_frac = 0.002;
- bca->min_support = 1;
- bca->per_sample_flt = 0;
- bca->npos = 100;
- bca->ref_pos = calloc(bca->npos, sizeof(int));
- bca->alt_pos = calloc(bca->npos, sizeof(int));
- return bca;
-}
-
-
-static int get_position(const bam_pileup1_t *p, int *len)
-{
- int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1;
- for (icig=0; icig<p->b->core.n_cigar; icig++)
- {
- // Conversion from uint32_t to MIDNSHP
- // 0123456
- // MIDNSHP
- int cig = bam1_cigar(p->b)[icig] & BAM_CIGAR_MASK;
- int ncig = bam1_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT;
- if ( cig==0 )
- {
- n_tot_bases += ncig;
- iread += ncig;
- }
- else if ( cig==1 )
- {
- n_tot_bases += ncig;
- iread += ncig;
- }
- else if ( cig==4 )
- {
- iread += ncig;
- if ( iread<=p->qpos ) edist -= ncig;
- }
- }
- *len = n_tot_bases;
- return edist;
-}
-
-void bcf_call_destroy(bcf_callaux_t *bca)
-{
- if (bca == 0) return;
- errmod_destroy(bca->e);
- if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; }
- free(bca->bases); free(bca->inscns); free(bca);
-}
-/* ref_base is the 4-bit representation of the reference base. It is
- * negative if we are looking at an indel. */
-int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
-{
- int i, n, ref4, is_indel, ori_depth = 0;
- memset(r, 0, sizeof(bcf_callret1_t));
- if (ref_base >= 0) {
- ref4 = bam_nt16_nt4_table[ref_base];
- is_indel = 0;
- } else ref4 = 4, is_indel = 1;
- if (_n == 0) return -1;
- // enlarge the bases array if necessary
- if (bca->max_bases < _n) {
- bca->max_bases = _n;
- kroundup32(bca->max_bases);
- bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
- }
- // fill the bases array
- for (i = n = r->n_supp = 0; i < _n; ++i) {
- const bam_pileup1_t *p = pl + i;
- int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
- // set base
- if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
- ++ori_depth;
- baseQ = q = is_indel? p->aux&0xff : (int)bam1_qual(p->b)[p->qpos]; // base/indel quality
- seqQ = is_indel? (p->aux>>8&0xff) : 99;
- if (q < bca->min_baseQ) continue;
- if (q > seqQ) q = seqQ;
- mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
- mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
- if (q > mapQ) q = mapQ;
- if (q > 63) q = 63;
- if (q < 4) q = 4;
- if (!is_indel) {
- b = bam1_seqi(bam1_seq(p->b), p->qpos); // base
- b = bam_nt16_nt4_table[b? b : ref_base]; // b is the 2-bit base
- is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
- } else {
- b = p->aux>>16&0x3f;
- is_diff = (b != 0);
- }
- if (is_diff) ++r->n_supp;
- bca->bases[n++] = q<<5 | (int)bam1_strand(p->b)<<4 | b;
- // collect annotations
- if (b < 4) r->qsum[b] += q;
- ++r->anno[0<<2|is_diff<<1|bam1_strand(p->b)];
- min_dist = p->b->core.l_qseq - 1 - p->qpos;
- if (min_dist > p->qpos) min_dist = p->qpos;
- if (min_dist > CAP_DIST) min_dist = CAP_DIST;
- r->anno[1<<2|is_diff<<1|0] += baseQ;
- r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ;
- r->anno[2<<2|is_diff<<1|0] += mapQ;
- r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ;
- r->anno[3<<2|is_diff<<1|0] += min_dist;
- r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
-
- // collect read positions for ReadPosBias
- int len, pos = get_position(p, &len);
- int epos = (double)pos/(len+1) * bca->npos;
- if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base )
- bca->ref_pos[epos]++;
- else
- bca->alt_pos[epos]++;
- }
- r->depth = n; r->ori_depth = ori_depth;
- // glfgen
- errmod_cal(bca->e, n, 5, bca->bases, r->p);
- return r->depth;
-}
-
-double mann_whitney_1947(int n, int m, int U)
-{
- if (U<0) return 0;
- if (n==0||m==0) return U==0 ? 1 : 0;
- return (double)n/(n+m)*mann_whitney_1947(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947(n,m-1,U);
-}
-
-void calc_ReadPosBias(bcf_callaux_t *bca, bcf_call_t *call)
-{
- int i, nref = 0, nalt = 0;
- unsigned long int U = 0;
- for (i=0; i<bca->npos; i++)
- {
- nref += bca->ref_pos[i];
- nalt += bca->alt_pos[i];
- U += nref*bca->alt_pos[i];
- bca->ref_pos[i] = 0;
- bca->alt_pos[i] = 0;
- }
-#if 0
-//todo
- double var = 0, avg = (double)(nref+nalt)/bca->npos;
- for (i=0; i<bca->npos; i++)
- {
- double ediff = bca->ref_pos[i] + bca->alt_pos[i] - avg;
- var += ediff*ediff;
- bca->ref_pos[i] = 0;
- bca->alt_pos[i] = 0;
- }
- call->read_pos.avg = avg;
- call->read_pos.var = sqrt(var/bca->npos);
- call->read_pos.dp = nref+nalt;
-#endif
- if ( !nref || !nalt )
- {
- call->read_pos_bias = -1;
- return;
- }
-
- if ( nref>=8 || nalt>=8 )
- {
- // normal approximation
- double mean = ((double)nref*nalt+1.0)/2.0;
- double var2 = (double)nref*nalt*(nref+nalt+1.0)/12.0;
- double z = (U-mean)/sqrt(var2);
- call->read_pos_bias = z;
- //fprintf(stderr,"nref=%d nalt=%d U=%ld mean=%e var=%e zval=%e\n", nref,nalt,U,mean,sqrt(var2),call->read_pos_bias);
- }
- else
- {
- double p = mann_whitney_1947(nalt,nref,U);
- // biased form claimed by GATK to behave better empirically
- // double var2 = (1.0+1.0/(nref+nalt+1.0))*(double)nref*nalt*(nref+nalt+1.0)/12.0;
- double var2 = (double)nref*nalt*(nref+nalt+1.0)/12.0;
- double z;
- if ( p >= 1./sqrt(var2*2*M_PI) ) z = 0; // equal to mean
- else
- {
- if ( U >= nref*nalt/2. ) z = sqrt(-2*log(sqrt(var2*2*M_PI)*p));
- else z = -sqrt(-2*log(sqrt(var2*2*M_PI)*p));
- }
- call->read_pos_bias = z;
- //fprintf(stderr,"nref=%d nalt=%d U=%ld p=%e var2=%e zval=%e\n", nref,nalt,U, p,var2,call->read_pos_bias);
- }
-}
-
-float mean_diff_to_prob(float mdiff, int dp, int readlen)
-{
- if ( dp==2 )
- {
- if ( mdiff==0 )
- return (2.0*readlen + 4.0*(readlen-1.0))/((float)readlen*readlen);
- else
- return 8.0*(readlen - 4.0*mdiff)/((float)readlen*readlen);
- }
-
- // This is crude empirical approximation and is not very accurate for
- // shorter read lengths (<100bp). There certainly is a room for
- // improvement.
- const float mv[24][2] = { {0,0}, {0,0}, {0,0},
- { 9.108, 4.934}, { 9.999, 3.991}, {10.273, 3.485}, {10.579, 3.160},
- {10.828, 2.889}, {11.014, 2.703}, {11.028, 2.546}, {11.244, 2.391},
- {11.231, 2.320}, {11.323, 2.138}, {11.403, 2.123}, {11.394, 1.994},
- {11.451, 1.928}, {11.445, 1.862}, {11.516, 1.815}, {11.560, 1.761},
- {11.544, 1.728}, {11.605, 1.674}, {11.592, 1.652}, {11.674, 1.613},
- {11.641, 1.570} };
-
- float m, v;
- if ( dp>=24 )
- {
- m = readlen/8.;
- if (dp>100) dp = 100;
- v = 1.476/(0.182*pow(dp,0.514));
- v = v*(readlen/100.);
- }
- else
- {
- m = mv[dp][0];
- v = mv[dp][1];
- m = m*readlen/100.;
- v = v*readlen/100.;
- v *= 1.2; // allow more variability
- }
- return 1.0/(v*sqrt(2*M_PI)) * exp(-0.5*((mdiff-m)/v)*((mdiff-m)/v));
-}
-
-void calc_vdb(bcf_callaux_t *bca, bcf_call_t *call)
-{
- int i, dp = 0;
- float mean_pos = 0, mean_diff = 0;
- for (i=0; i<bca->npos; i++)
- {
- if ( !bca->alt_pos[i] ) continue;
- dp += bca->alt_pos[i];
- int j = i<bca->npos/2 ? i : bca->npos - i;
- mean_pos += bca->alt_pos[i]*j;
- }
- if ( dp<2 )
- {
- call->vdb = -1;
- return;
- }
- mean_pos /= dp;
- for (i=0; i<bca->npos; i++)
- {
- if ( !bca->alt_pos[i] ) continue;
- int j = i<bca->npos/2 ? i : bca->npos - i;
- mean_diff += bca->alt_pos[i] * fabs(j - mean_pos);
- }
- mean_diff /= dp;
- call->vdb = mean_diff_to_prob(mean_diff, dp, bca->npos);
-}
-
-/**
- * bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles
- * @n: number of samples
- * @calls: each sample's calls
- * @bca: auxiliary data structure for holding temporary values
- * @ref_base: the reference base
- * @call: filled with the annotations
- */
-int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call)
-{
- int ref4, i, j, qsum[4];
- int64_t tmp;
- if (ref_base >= 0) {
- call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
- if (ref4 > 4) ref4 = 4;
- } else call->ori_ref = -1, ref4 = 0;
- // calculate qsum
- memset(qsum, 0, 4 * sizeof(int));
- for (i = 0; i < n; ++i)
- for (j = 0; j < 4; ++j)
- qsum[j] += calls[i].qsum[j];
- int qsum_tot=0;
- for (j=0; j<4; j++) { qsum_tot += qsum[j]; call->qsum[j] = 0; }
- for (j = 0; j < 4; ++j) qsum[j] = qsum[j] << 2 | j;
- // find the top 2 alleles
- for (i = 1; i < 4; ++i) // insertion sort
- for (j = i; j > 0 && qsum[j] < qsum[j-1]; --j)
- tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp;
- // set the reference allele and alternative allele(s)
- for (i = 0; i < 5; ++i) call->a[i] = -1;
- call->unseen = -1;
- call->a[0] = ref4;
- for (i = 3, j = 1; i >= 0; --i) {
- if ((qsum[i]&3) != ref4) {
- if (qsum[i]>>2 != 0)
- {
- if ( j<4 ) call->qsum[j] = (float)(qsum[i]>>2)/qsum_tot; // ref N can make j>=4
- call->a[j++] = qsum[i]&3;
- }
- else break;
- }
- else
- call->qsum[0] = (float)(qsum[i]>>2)/qsum_tot;
- }
- if (ref_base >= 0) { // for SNPs, find the "unseen" base
- if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
- call->unseen = j, call->a[j++] = qsum[i]&3;
- call->n_alleles = j;
- } else {
- call->n_alleles = j;
- if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
- }
- // set the PL array
- if (call->n < n) {
- call->n = n;
- call->PL = realloc(call->PL, 15 * n);
- }
- {
- int x, g[15], z;
- double sum_min = 0.;
- x = call->n_alleles * (call->n_alleles + 1) / 2;
- // get the possible genotypes
- for (i = z = 0; i < call->n_alleles; ++i)
- for (j = 0; j <= i; ++j)
- g[z++] = call->a[j] * 5 + call->a[i];
- for (i = 0; i < n; ++i) {
- uint8_t *PL = call->PL + x * i;
- const bcf_callret1_t *r = calls + i;
- float min = 1e37;
- for (j = 0; j < x; ++j)
- if (min > r->p[g[j]]) min = r->p[g[j]];
- sum_min += min;
- for (j = 0; j < x; ++j) {
- int y;
- y = (int)(r->p[g[j]] - min + .499);
- if (y > 255) y = 255;
- PL[j] = y;
- }
- }
-// if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
- call->shift = (int)(sum_min + .499);
- }
- // combine annotations
- memset(call->anno, 0, 16 * sizeof(int));
- for (i = call->depth = call->ori_depth = 0, tmp = 0; i < n; ++i) {
- call->depth += calls[i].depth;
- call->ori_depth += calls[i].ori_depth;
- for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
- }
-
- calc_vdb(bca, call);
- calc_ReadPosBias(bca, call);
-
- return 0;
-}
-
-int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag,
- const bcf_callaux_t *bca, const char *ref)
-{
- extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
- kstring_t s;
- int i, j;
- b->n_smpl = bc->n;
- b->tid = tid; b->pos = pos; b->qual = 0;
- s.s = b->str; s.m = b->m_str; s.l = 0;
- kputc('\0', &s);
- if (bc->ori_ref < 0) { // an indel
- // write REF
- kputc(ref[pos], &s);
- for (j = 0; j < bca->indelreg; ++j) kputc(ref[pos+1+j], &s);
- kputc('\0', &s);
- // write ALT
- kputc(ref[pos], &s);
- for (i = 1; i < 4; ++i) {
- if (bc->a[i] < 0) break;
- if (i > 1) {
- kputc(',', &s); kputc(ref[pos], &s);
- }
- if (bca->indel_types[bc->a[i]] < 0) { // deletion
- for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j)
- kputc(ref[pos+1+j], &s);
- } else { // insertion; cannot be a reference unless a bug
- char *inscns = &bca->inscns[bc->a[i] * bca->maxins];
- for (j = 0; j < bca->indel_types[bc->a[i]]; ++j)
- kputc("ACGTN"[(int)inscns[j]], &s);
- for (j = 0; j < bca->indelreg; ++j) kputc(ref[pos+1+j], &s);
- }
- }
- kputc('\0', &s);
- } else { // a SNP
- kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
- for (i = 1; i < 5; ++i) {
- if (bc->a[i] < 0) break;
- if (i > 1) kputc(',', &s);
- kputc(bc->unseen == i? 'X' : "ACGT"[bc->a[i]], &s);
- }
- kputc('\0', &s);
- }
- kputc('\0', &s);
- // INFO
- if (bc->ori_ref < 0) ksprintf(&s,"INDEL;IS=%d,%f;", bca->max_support, bca->max_frac);
- kputs("DP=", &s); kputw(bc->ori_depth, &s); kputs(";I16=", &s);
- for (i = 0; i < 16; ++i) {
- if (i) kputc(',', &s);
- kputw(bc->anno[i], &s);
- }
- //ksprintf(&s,";RPS=%d,%f,%f", bc->read_pos.dp,bc->read_pos.avg,bc->read_pos.var);
- ksprintf(&s,";QS=%f,%f,%f,%f", bc->qsum[0],bc->qsum[1],bc->qsum[2],bc->qsum[3]);
- if (bc->vdb != -1)
- ksprintf(&s, ";VDB=%e", bc->vdb);
- if (bc->read_pos_bias != -1 )
- ksprintf(&s, ";RPB=%e", bc->read_pos_bias);
- kputc('\0', &s);
- // FMT
- kputs("PL", &s);
- if (bcr && fmt_flag) {
- if (fmt_flag & B2B_FMT_DP) kputs(":DP", &s);
- if (fmt_flag & B2B_FMT_DV) kputs(":DV", &s);
- if (fmt_flag & B2B_FMT_SP) kputs(":SP", &s);
- }
- kputc('\0', &s);
- b->m_str = s.m; b->str = s.s; b->l_str = s.l;
- bcf_sync(b);
- memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n);
- if (bcr && fmt_flag) {
- uint16_t *dp = (fmt_flag & B2B_FMT_DP)? b->gi[1].data : 0;
- uint16_t *dv = (fmt_flag & B2B_FMT_DV)? b->gi[1 + ((fmt_flag & B2B_FMT_DP) != 0)].data : 0;
- int32_t *sp = (fmt_flag & B2B_FMT_SP)? b->gi[1 + ((fmt_flag & B2B_FMT_DP) != 0) + ((fmt_flag & B2B_FMT_DV) != 0)].data : 0;
- for (i = 0; i < bc->n; ++i) {
- bcf_callret1_t *p = bcr + i;
- if (dp) dp[i] = p->depth < 0xffff? p->depth : 0xffff;
- if (dv) dv[i] = p->n_supp < 0xffff? p->n_supp : 0xffff;
- if (sp) {
- if (p->anno[0] + p->anno[1] < 2 || p->anno[2] + p->anno[3] < 2
- || p->anno[0] + p->anno[2] < 2 || p->anno[1] + p->anno[3] < 2)
- {
- sp[i] = 0;
- } else {
- double left, right, two;
- int x;
- kt_fisher_exact(p->anno[0], p->anno[1], p->anno[2], p->anno[3], &left, &right, &two);
- x = (int)(-4.343 * log(two) + .499);
- if (x > 255) x = 255;
- sp[i] = x;
- }
- }
- }
- }
- return 0;
-}
diff --git a/sam/bam2bcf.h b/sam/bam2bcf.h
deleted file mode 100644
index b2b1825..0000000
--- a/sam/bam2bcf.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef BAM2BCF_H
-#define BAM2BCF_H
-
-#include <stdint.h>
-#include "errmod.h"
-#include "bcftools/bcf.h"
-
-#define B2B_INDEL_NULL 10000
-
-#define B2B_FMT_DP 0x1
-#define B2B_FMT_SP 0x2
-#define B2B_FMT_DV 0x4
-
-typedef struct __bcf_callaux_t {
- int capQ, min_baseQ;
- int openQ, extQ, tandemQ; // for indels
- int min_support, max_support; // for collecting indel candidates
- double min_frac, max_frac; // for collecting indel candidates
- int per_sample_flt; // indel filtering strategy
- int *ref_pos, *alt_pos, npos; // for ReadPosBias
- // for internal uses
- int max_bases;
- int indel_types[4];
- int maxins, indelreg;
- int read_len;
- char *inscns;
- uint16_t *bases;
- errmod_t *e;
- void *rghash;
-} bcf_callaux_t;
-
-typedef struct {
- int depth, n_supp, ori_depth, qsum[4];
- unsigned int anno[16];
- float p[25];
-} bcf_callret1_t;
-
-typedef struct {
- int a[5]; // alleles: ref, alt, alt2, alt3
- float qsum[4];
- int n, n_alleles, shift, ori_ref, unseen;
- int n_supp; // number of supporting non-reference reads
- unsigned int anno[16], depth, ori_depth;
- uint8_t *PL;
- float vdb; // variant distance bias
- float read_pos_bias;
- struct { float avg, var; int dp; } read_pos;
-} bcf_call_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- bcf_callaux_t *bcf_call_init(double theta, int min_baseQ);
- void bcf_call_destroy(bcf_callaux_t *bca);
- int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r);
- int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call);
- int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag,
- const bcf_callaux_t *bca, const char *ref);
- int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref,
- const void *rghash);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/sam/bam2bcf_indel.c b/sam/bam2bcf_indel.c
deleted file mode 100644
index 30b3f46..0000000
--- a/sam/bam2bcf_indel.c
+++ /dev/null
@@ -1,498 +0,0 @@
-#include <assert.h>
-#include <ctype.h>
-#include <string.h>
-#include "bam.h"
-#include "bam2bcf.h"
-#include "kaln.h"
-#include "kprobaln.h"
-#include "khash.h"
-KHASH_SET_INIT_STR(rg)
-
-#include "ksort.h"
-KSORT_INIT_GENERIC(uint32_t)
-
-#define MINUS_CONST 0x10000000
-#define INDEL_WINDOW_SIZE 50
-
-void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list)
-{
- const char *s, *p, *q, *r, *t;
- khash_t(rg) *hash;
- if (list == 0 || hdtext == 0) return _hash;
- if (_hash == 0) _hash = kh_init(rg);
- hash = (khash_t(rg)*)_hash;
- if ((s = strstr(hdtext, "@RG\t")) == 0) return hash;
- do {
- t = strstr(s + 4, "@RG\t"); // the next @RG
- if ((p = strstr(s, "\tID:")) != 0) p += 4;
- if ((q = strstr(s, "\tPL:")) != 0) q += 4;
- if (p && q && (t == 0 || (p < t && q < t))) { // ID and PL are both present
- int lp, lq;
- char *x;
- for (r = p; *r && *r != '\t' && *r != '\n'; ++r); lp = r - p;
- for (r = q; *r && *r != '\t' && *r != '\n'; ++r); lq = r - q;
- x = calloc((lp > lq? lp : lq) + 1, 1);
- for (r = q; *r && *r != '\t' && *r != '\n'; ++r) x[r-q] = *r;
- if (strstr(list, x)) { // insert ID to the hash table
- khint_t k;
- int ret;
- for (r = p; *r && *r != '\t' && *r != '\n'; ++r) x[r-p] = *r;
- x[r-p] = 0;
- k = kh_get(rg, hash, x);
- if (k == kh_end(hash)) k = kh_put(rg, hash, x, &ret);
- else free(x);
- } else free(x);
- }
- s = t;
- } while (s);
- return hash;
-}
-
-void bcf_call_del_rghash(void *_hash)
-{
- khint_t k;
- khash_t(rg) *hash = (khash_t(rg)*)_hash;
- if (hash == 0) return;
- for (k = kh_begin(hash); k < kh_end(hash); ++k)
- if (kh_exist(hash, k))
- free((char*)kh_key(hash, k));
- kh_destroy(rg, hash);
-}
-
-static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
-{
- int k, x = c->pos, y = 0, last_y = 0;
- *_tpos = c->pos;
- for (k = 0; k < c->n_cigar; ++k) {
- int op = cigar[k] & BAM_CIGAR_MASK;
- int l = cigar[k] >> BAM_CIGAR_SHIFT;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- if (c->pos > tpos) return y;
- if (x + l > tpos) {
- *_tpos = tpos;
- return y + (tpos - x);
- }
- x += l; y += l;
- last_y = y;
- } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
- else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
- if (x + l > tpos) {
- *_tpos = is_left? x : x + l;
- return y;
- }
- x += l;
- }
- }
- *_tpos = x;
- return last_y;
-}
-// FIXME: check if the inserted sequence is consistent with the homopolymer run
-// l is the relative gap length and l_run is the length of the homopolymer on the reference
-static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run)
-{
- int q, qh;
- q = bca->openQ + bca->extQ * (abs(l) - 1);
- qh = l_run >= 3? (int)(bca->tandemQ * (double)abs(l) / l_run + .499) : 1000;
- return q < qh? q : qh;
-}
-
-static inline int est_indelreg(int pos, const char *ref, int l, char *ins4)
-{
- int i, j, max = 0, max_i = pos, score = 0;
- l = abs(l);
- for (i = pos + 1, j = 0; ref[i]; ++i, ++j) {
- if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1;
- else score += (toupper(ref[i]) != toupper(ref[pos+1+j%l]))? -10 : 1;
- if (score < 0) break;
- if (max < score) max = score, max_i = i;
- }
- return max_i - pos;
-}
-
-/*
- * @n: number of samples
- */
-int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref,
- const void *rghash)
-{
- int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2;
- int N, K, l_run, ref_type, n_alt;
- char *inscns = 0, *ref2, *query, **ref_sample;
- khash_t(rg) *hash = (khash_t(rg)*)rghash;
- if (ref == 0 || bca == 0) return -1;
- // mark filtered reads
- if (rghash) {
- N = 0;
- for (s = N = 0; s < n; ++s) {
- for (i = 0; i < n_plp[s]; ++i) {
- bam_pileup1_t *p = plp[s] + i;
- const uint8_t *rg = bam_aux_get(p->b, "RG");
- p->aux = 1; // filtered by default
- if (rg) {
- khint_t k = kh_get(rg, hash, (const char*)(rg + 1));
- if (k != kh_end(hash)) p->aux = 0, ++N; // not filtered
- }
- }
- }
- if (N == 0) return -1; // no reads left
- }
- // determine if there is a gap
- for (s = N = 0; s < n; ++s) {
- for (i = 0; i < n_plp[s]; ++i)
- if (plp[s][i].indel != 0) break;
- if (i < n_plp[s]) break;
- }
- if (s == n) return -1; // there is no indel at this position.
- for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
- { // find out how many types of indels are present
- bca->max_support = bca->max_frac = 0;
- int m, n_alt = 0, n_tot = 0, indel_support_ok = 0;
- uint32_t *aux;
- aux = calloc(N + 1, 4);
- m = max_rd_len = 0;
- aux[m++] = MINUS_CONST; // zero indel is always a type
- for (s = 0; s < n; ++s) {
- int na = 0, nt = 0;
- for (i = 0; i < n_plp[s]; ++i) {
- const bam_pileup1_t *p = plp[s] + i;
- if (rghash == 0 || p->aux == 0) {
- ++nt;
- if (p->indel != 0) {
- ++na;
- aux[m++] = MINUS_CONST + p->indel;
- }
- }
- j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b));
- if (j > max_rd_len) max_rd_len = j;
- }
- float frac = (float)na/nt;
- if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac )
- indel_support_ok = 1;
- if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac;
- n_alt += na;
- n_tot += nt;
- }
- // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases),
- // check the number of N's in the sequence and skip places where half or more reference bases are Ns.
- int nN=0; for (i=pos; i-pos<max_rd_len && ref[i]; i++) if ( ref[i]=='N' ) nN++;
- if ( nN*2>i ) { free(aux); return -1; }
-
- ks_introsort(uint32_t, m, aux);
- // squeeze out identical types
- for (i = 1, n_types = 1; i < m; ++i)
- if (aux[i] != aux[i-1]) ++n_types;
- // Taking totals makes it hard to call rare indels
- if ( !bca->per_sample_flt )
- indel_support_ok = ( (float)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
- if ( n_types == 1 || !indel_support_ok ) { // then skip
- free(aux); return -1;
- }
- if (n_types >= 64) {
- free(aux);
- if (bam_verbose >= 2)
- fprintf(stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1);
- return -1;
- }
- types = (int*)calloc(n_types, sizeof(int));
- t = 0;
- types[t++] = aux[0] - MINUS_CONST;
- for (i = 1; i < m; ++i)
- if (aux[i] != aux[i-1])
- types[t++] = aux[i] - MINUS_CONST;
- free(aux);
- for (t = 0; t < n_types; ++t)
- if (types[t] == 0) break;
- ref_type = t; // the index of the reference type (0)
- }
- { // calculate left and right boundary
- left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
- right = pos + INDEL_WINDOW_SIZE;
- if (types[0] < 0) right -= types[0];
- // in case the alignments stand out the reference
- for (i = pos; i < right; ++i)
- if (ref[i] == 0) break;
- right = i;
- }
- /* The following block fixes a long-existing flaw in the INDEL
- * calling model: the interference of nearby SNPs. However, it also
- * reduces the power because sometimes, substitutions caused by
- * indels are not distinguishable from true mutations. Multiple
- * sequence realignment helps to increase the power.
- *
- * Masks mismatches present in at least 70% of the reads with 'N'.
- */
- { // construct per-sample consensus
- int L = right - left + 1, max_i, max2_i;
- uint32_t *cns, max, max2;
- char *ref0, *r;
- ref_sample = calloc(n, sizeof(void*));
- cns = calloc(L, 4);
- ref0 = calloc(L, 1);
- for (i = 0; i < right - left; ++i)
- ref0[i] = bam_nt16_table[(int)ref[i+left]];
- for (s = 0; s < n; ++s) {
- r = ref_sample[s] = calloc(L, 1);
- memset(cns, 0, sizeof(int) * L);
- // collect ref and non-ref counts
- for (i = 0; i < n_plp[s]; ++i) {
- bam_pileup1_t *p = plp[s] + i;
- bam1_t *b = p->b;
- uint32_t *cigar = bam1_cigar(b);
- uint8_t *seq = bam1_seq(b);
- int x = b->core.pos, y = 0;
- for (k = 0; k < b->core.n_cigar; ++k) {
- int op = cigar[k]&0xf;
- int j, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (j = 0; j < l; ++j)
- if (x + j >= left && x + j < right)
- cns[x+j-left] += (bam1_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
- x += l; y += l;
- } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
- else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
- }
- }
- // determine the consensus
- for (i = 0; i < right - left; ++i) r[i] = ref0[i];
- max = max2 = 0; max_i = max2_i = -1;
- for (i = 0; i < right - left; ++i) {
- if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i;
- else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i;
- }
- if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1;
- if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1;
- if (max_i >= 0) r[max_i] = 15;
- if (max2_i >= 0) r[max2_i] = 15;
- //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr);
- }
- free(ref0); free(cns);
- }
- { // the length of the homopolymer run around the current position
- int c = bam_nt16_table[(int)ref[pos + 1]];
- if (c == 15) l_run = 1;
- else {
- for (i = pos + 2; ref[i]; ++i)
- if (bam_nt16_table[(int)ref[i]] != c) break;
- l_run = i;
- for (i = pos; i >= 0; --i)
- if (bam_nt16_table[(int)ref[i]] != c) break;
- l_run -= i + 1;
- }
- }
- // construct the consensus sequence
- max_ins = types[n_types - 1]; // max_ins is at least 0
- if (max_ins > 0) {
- int *inscns_aux = calloc(5 * n_types * max_ins, sizeof(int));
- // count the number of occurrences of each base at each position for each type of insertion
- for (t = 0; t < n_types; ++t) {
- if (types[t] > 0) {
- for (s = 0; s < n; ++s) {
- for (i = 0; i < n_plp[s]; ++i) {
- bam_pileup1_t *p = plp[s] + i;
- if (p->indel == types[t]) {
- uint8_t *seq = bam1_seq(p->b);
- for (k = 1; k <= p->indel; ++k) {
- int c = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos + k)];
- assert(c<5);
- ++inscns_aux[(t*max_ins+(k-1))*5 + c];
- }
- }
- }
- }
- }
- }
- // use the majority rule to construct the consensus
- inscns = calloc(n_types * max_ins, 1);
- for (t = 0; t < n_types; ++t) {
- for (j = 0; j < types[t]; ++j) {
- int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
- for (k = 0; k < 5; ++k)
- if (ia[k] > max)
- max = ia[k], max_k = k;
- inscns[t*max_ins + j] = max? max_k : 4;
- if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's
- }
- }
- free(inscns_aux);
- }
- // compute the likelihood given each type of indel for each read
- max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]);
- ref2 = calloc(max_ref2, 1);
- query = calloc(right - left + max_rd_len + max_ins + 2, 1);
- score1 = calloc(N * n_types, sizeof(int));
- score2 = calloc(N * n_types, sizeof(int));
- bca->indelreg = 0;
- for (t = 0; t < n_types; ++t) {
- int l, ir;
- kpa_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
- apf1.bw = apf2.bw = abs(types[t]) + 3;
- // compute indelreg
- if (types[t] == 0) ir = 0;
- else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
- else ir = est_indelreg(pos, ref, -types[t], 0);
- if (ir > bca->indelreg) bca->indelreg = ir;
-// fprintf(stderr, "%d, %d, %d\n", pos, types[t], ir);
- // realignment
- for (s = K = 0; s < n; ++s) {
- // write ref2
- for (k = 0, j = left; j <= pos; ++j)
- ref2[k++] = bam_nt16_nt4_table[(int)ref_sample[s][j-left]];
- if (types[t] <= 0) j += -types[t];
- else for (l = 0; l < types[t]; ++l)
- ref2[k++] = inscns[t*max_ins + l];
- for (; j < right && ref[j]; ++j)
- ref2[k++] = bam_nt16_nt4_table[(int)ref_sample[s][j-left]];
- for (; k < max_ref2; ++k) ref2[k] = 4;
- if (j < right) right = j;
- // align each read to ref2
- for (i = 0; i < n_plp[s]; ++i, ++K) {
- bam_pileup1_t *p = plp[s] + i;
- int qbeg, qend, tbeg, tend, sc, kk;
- uint8_t *seq = bam1_seq(p->b);
- uint32_t *cigar = bam1_cigar(p->b);
- if (p->b->core.flag&4) continue; // unmapped reads
- // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway.
- for (kk = 0; kk < p->b->core.n_cigar; ++kk)
- if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break;
- if (kk < p->b->core.n_cigar) continue;
- // FIXME: the following skips soft clips, but using them may be more sensitive.
- // determine the start and end of sequences for alignment
- qbeg = tpos2qpos(&p->b->core, bam1_cigar(p->b), left, 0, &tbeg);
- qend = tpos2qpos(&p->b->core, bam1_cigar(p->b), right, 1, &tend);
- if (types[t] < 0) {
- int l = -types[t];
- tbeg = tbeg - l > left? tbeg - l : left;
- }
- // write the query sequence
- for (l = qbeg; l < qend; ++l)
- query[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(seq, l)];
- { // do realignment; this is the bottleneck
- const uint8_t *qual = bam1_qual(p->b), *bq;
- uint8_t *qq;
- qq = calloc(qend - qbeg, 1);
- bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
- if (bq) ++bq; // skip type
- for (l = qbeg; l < qend; ++l) {
- qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l];
- if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
- if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
- }
- sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
- l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
- if (l > 255) l = 255;
- score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
- if (sc > 5) {
- sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
- l = (int)(100. * sc / (qend - qbeg) + .499);
- if (l > 255) l = 255;
- score2[K*n_types + t] = sc<<8 | l;
- }
- free(qq);
- }
-/*
- for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
- fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr);
- fputc('\n', stderr);
- for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], stderr);
- fputc('\n', stderr);
- fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc);
-*/
- }
- }
- }
- free(ref2); free(query);
- { // compute indelQ
- int *sc, tmp, *sumq;
- sc = alloca(n_types * sizeof(int));
- sumq = alloca(n_types * sizeof(int));
- memset(sumq, 0, sizeof(int) * n_types);
- for (s = K = 0; s < n; ++s) {
- for (i = 0; i < n_plp[s]; ++i, ++K) {
- bam_pileup1_t *p = plp[s] + i;
- int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ;
- for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
- for (t = 1; t < n_types; ++t) // insertion sort
- for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
- tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
- /* errmod_cal() assumes that if the call is wrong, the
- * likelihoods of other events are equal. This is about
- * right for substitutions, but is not desired for
- * indels. To reuse errmod_cal(), I have to make
- * compromise for multi-allelic indels.
- */
- if ((sc[0]&0x3f) == ref_type) {
- indelQ1 = (sc[1]>>14) - (sc[0]>>14);
- seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
- } else {
- for (t = 0; t < n_types; ++t) // look for the reference type
- if ((sc[t]&0x3f) == ref_type) break;
- indelQ1 = (sc[t]>>14) - (sc[0]>>14);
- seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
- }
- tmp = sc[0]>>6 & 0xff;
- indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ
- sct = &score2[K*n_types];
- for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
- for (t = 1; t < n_types; ++t) // insertion sort
- for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
- tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
- if ((sc[0]&0x3f) == ref_type) {
- indelQ2 = (sc[1]>>14) - (sc[0]>>14);
- } else {
- for (t = 0; t < n_types; ++t) // look for the reference type
- if ((sc[t]&0x3f) == ref_type) break;
- indelQ2 = (sc[t]>>14) - (sc[0]>>14);
- }
- tmp = sc[0]>>6 & 0xff;
- indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499);
- // pick the smaller between indelQ1 and indelQ2
- indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2;
- if (indelQ > 255) indelQ = 255;
- if (seqQ > 255) seqQ = 255;
- p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
- sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
-// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
- }
- }
- // determine bca->indel_types[] and bca->inscns
- bca->maxins = max_ins;
- bca->inscns = realloc(bca->inscns, bca->maxins * 4);
- for (t = 0; t < n_types; ++t)
- sumq[t] = sumq[t]<<6 | t;
- for (t = 1; t < n_types; ++t) // insertion sort
- for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
- tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
- for (t = 0; t < n_types; ++t) // look for the reference type
- if ((sumq[t]&0x3f) == ref_type) break;
- if (t) { // then move the reference type to the first
- tmp = sumq[t];
- for (; t > 0; --t) sumq[t] = sumq[t-1];
- sumq[0] = tmp;
- }
- for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
- for (t = 0; t < 4 && t < n_types; ++t) {
- bca->indel_types[t] = types[sumq[t]&0x3f];
- memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
- }
- // update p->aux
- for (s = n_alt = 0; s < n; ++s) {
- for (i = 0; i < n_plp[s]; ++i) {
- bam_pileup1_t *p = plp[s] + i;
- int x = types[p->aux>>16&0x3f];
- for (j = 0; j < 4; ++j)
- if (x == bca->indel_types[j]) break;
- p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
- if ((p->aux>>16&0x3f) > 0) ++n_alt;
-// fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d q=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), p->aux>>16&63, bca->indel_types[p->aux>>16&63], p->aux&0xff, p->aux>>8&0xff);
- }
- }
- }
- free(score1); free(score2);
- // free
- for (i = 0; i < n; ++i) free(ref_sample[i]);
- free(ref_sample);
- free(types); free(inscns);
- return n_alt > 0? 0 : -1;
-}
diff --git a/sam/bam2depth.c b/sam/bam2depth.c
deleted file mode 100644
index 02311ef..0000000
--- a/sam/bam2depth.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/* This program demonstrates how to generate pileup from multiple BAMs
- * simutaneously, to achieve random access and to use the BED interface.
- * To compile this program separately, you may:
- *
- * gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -L. -lbam -lz
- */
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include <unistd.h>
-#include "bam.h"
-
-typedef struct { // auxiliary data structure
- bamFile fp; // the file handler
- bam_iter_t iter; // NULL if a region not specified
- int min_mapQ, min_len; // mapQ filter; length filter
-} aux_t;
-
-void *bed_read(const char *fn); // read a BED or position list file
-void bed_destroy(void *_h); // destroy the BED data structure
-int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps
-
-// This function reads a BAM alignment from one BAM file.
-static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup
-{
- aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
- int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b);
- if (!(b->core.flag&BAM_FUNMAP)) {
- if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP;
- else if (aux->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < aux->min_len) b->core.flag |= BAM_FUNMAP;
- }
- return ret;
-}
-
-int read_file_list(const char *file_list,int *n,char **argv[]);
-
-#ifdef _MAIN_BAM2DEPTH
-int main(int argc, char *argv[])
-#else
-int main_depth(int argc, char *argv[])
-#endif
-{
- int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, nfiles;
- const bam_pileup1_t **plp;
- char *reg = 0; // specified region
- void *bed = 0; // BED data structure
- char *file_list = NULL, **fn = NULL;
- bam_header_t *h = 0; // BAM header of the 1st input
- aux_t **data;
- bam_mplp_t mplp;
-
- // parse the command line
- while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) {
- switch (n) {
- case 'l': min_len = atoi(optarg); break; // minimum query length
- case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header
- case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now
- case 'q': baseQ = atoi(optarg); break; // base quality threshold
- case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold
- case 'f': file_list = optarg; break;
- }
- }
- if (optind == argc && !file_list) {
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n");
- fprintf(stderr, "Options:\n");
- fprintf(stderr, " -b <bed> list of positions or regions\n");
- fprintf(stderr, " -f <list> list of input BAM filenames, one per line [null]\n");
- fprintf(stderr, " -l <int> minQLen\n");
- fprintf(stderr, " -q <int> base quality threshold\n");
- fprintf(stderr, " -Q <int> mapping quality threshold\n");
- fprintf(stderr, " -r <chr:from-to> region\n");
- fprintf(stderr, "\n");
- return 1;
- }
-
- // initialize the auxiliary data structures
- if (file_list)
- {
- if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
- n = nfiles;
- argv = fn;
- optind = 0;
- }
- else
- n = argc - optind; // the number of BAMs on the command line
- data = calloc(n, sizeof(void*)); // data[i] for the i-th input
- beg = 0; end = 1<<30; tid = -1; // set the default region
- for (i = 0; i < n; ++i) {
- bam_header_t *htmp;
- data[i] = calloc(1, sizeof(aux_t));
- data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM
- data[i]->min_mapQ = mapQ; // set the mapQ filter
- data[i]->min_len = min_len; // set the qlen filter
- htmp = bam_header_read(data[i]->fp); // read the BAM header
- if (i == 0) {
- h = htmp; // keep the header of the 1st BAM
- if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region
- } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header
- if (tid >= 0) { // if a region is specified and parsed successfully
- bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index
- data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator
- bam_index_destroy(idx); // the index is not needed any more; phase out of the memory
- }
- }
-
- // the core multi-pileup loop
- mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
- n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
- plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp)
- while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position
- if (pos < beg || pos >= end) continue; // out of range; skip
- if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
- fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster
- for (i = 0; i < n; ++i) { // base level filters have to go here
- int j, m = 0;
- for (j = 0; j < n_plp[i]; ++j) {
- const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know
- if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
- else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
- }
- printf("\t%d", n_plp[i] - m); // this the depth to output
- }
- putchar('\n');
- }
- free(n_plp); free(plp);
- bam_mplp_destroy(mplp);
-
- bam_header_destroy(h);
- for (i = 0; i < n; ++i) {
- bam_close(data[i]->fp);
- if (data[i]->iter) bam_iter_destroy(data[i]->iter);
- free(data[i]);
- }
- free(data); free(reg);
- if (bed) bed_destroy(bed);
- if ( file_list )
- {
- for (i=0; i<n; i++) free(fn[i]);
- free(fn);
- }
- return 0;
-}
diff --git a/sam/bam_aux.c b/sam/bam_aux.c
deleted file mode 100644
index 4bbf975..0000000
--- a/sam/bam_aux.c
+++ /dev/null
@@ -1,217 +0,0 @@
-#include <ctype.h>
-#include "bam.h"
-#include "khash.h"
-typedef char *str_p;
-KHASH_MAP_INIT_STR(s, int)
-KHASH_MAP_INIT_STR(r2l, str_p)
-
-void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
-{
- int ori_len = b->data_len;
- b->data_len += 3 + len;
- b->l_aux += 3 + len;
- if (b->m_data < b->data_len) {
- b->m_data = b->data_len;
- kroundup32(b->m_data);
- b->data = (uint8_t*)realloc(b->data, b->m_data);
- }
- b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
- b->data[ori_len + 2] = type;
- memcpy(b->data + ori_len + 3, data, len);
-}
-
-uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2])
-{
- return bam_aux_get(b, tag);
-}
-
-#define __skip_tag(s) do { \
- int type = toupper(*(s)); \
- ++(s); \
- if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \
- else if (type == 'B') (s) += 5 + bam_aux_type2size(*(s)) * (*(int32_t*)((s)+1)); \
- else (s) += bam_aux_type2size(type); \
- } while(0)
-
-uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
-{
- uint8_t *s;
- int y = tag[0]<<8 | tag[1];
- s = bam1_aux(b);
- while (s < b->data + b->data_len) {
- int x = (int)s[0]<<8 | s[1];
- s += 2;
- if (x == y) return s;
- __skip_tag(s);
- }
- return 0;
-}
-// s MUST BE returned by bam_aux_get()
-int bam_aux_del(bam1_t *b, uint8_t *s)
-{
- uint8_t *p, *aux;
- aux = bam1_aux(b);
- p = s - 2;
- __skip_tag(s);
- memmove(p, s, b->l_aux - (s - aux));
- b->data_len -= s - p;
- b->l_aux -= s - p;
- return 0;
-}
-
-int bam_aux_drop_other(bam1_t *b, uint8_t *s)
-{
- if (s) {
- uint8_t *p, *aux;
- aux = bam1_aux(b);
- p = s - 2;
- __skip_tag(s);
- memmove(aux, p, s - p);
- b->data_len -= b->l_aux - (s - p);
- b->l_aux = s - p;
- } else {
- b->data_len -= b->l_aux;
- b->l_aux = 0;
- }
- return 0;
-}
-
-void bam_init_header_hash(bam_header_t *header)
-{
- if (header->hash == 0) {
- int ret, i;
- khiter_t iter;
- khash_t(s) *h;
- header->hash = h = kh_init(s);
- for (i = 0; i < header->n_targets; ++i) {
- iter = kh_put(s, h, header->target_name[i], &ret);
- kh_value(h, iter) = i;
- }
- }
-}
-
-void bam_destroy_header_hash(bam_header_t *header)
-{
- if (header->hash)
- kh_destroy(s, (khash_t(s)*)header->hash);
-}
-
-int32_t bam_get_tid(const bam_header_t *header, const char *seq_name)
-{
- khint_t k;
- khash_t(s) *h = (khash_t(s)*)header->hash;
- k = kh_get(s, h, seq_name);
- return k == kh_end(h)? -1 : kh_value(h, k);
-}
-
-int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end)
-{
- char *s;
- int i, l, k, name_end;
- khiter_t iter;
- khash_t(s) *h;
-
- bam_init_header_hash(header);
- h = (khash_t(s)*)header->hash;
-
- *ref_id = *beg = *end = -1;
- name_end = l = strlen(str);
- s = (char*)malloc(l+1);
- // remove space
- for (i = k = 0; i < l; ++i)
- if (!isspace(str[i])) s[k++] = str[i];
- s[k] = 0; l = k;
- // determine the sequence name
- for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
- if (i >= 0) name_end = i;
- if (name_end < l) { // check if this is really the end
- int n_hyphen = 0;
- for (i = name_end + 1; i < l; ++i) {
- if (s[i] == '-') ++n_hyphen;
- else if (!isdigit(s[i]) && s[i] != ',') break;
- }
- if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
- s[name_end] = 0;
- iter = kh_get(s, h, s);
- if (iter == kh_end(h)) { // cannot find the sequence name
- iter = kh_get(s, h, str); // try str as the name
- if (iter == kh_end(h)) {
- if (bam_verbose >= 2) fprintf(stderr, "[%s] fail to determine the sequence name.\n", __func__);
- free(s); return -1;
- } else s[name_end] = ':', name_end = l;
- }
- } else iter = kh_get(s, h, str);
- if (iter == kh_end(h)) {
- free(s);
- return -1;
- }
- *ref_id = kh_val(h, iter);
- // parse the interval
- if (name_end < l) {
- for (i = k = name_end + 1; i < l; ++i)
- if (s[i] != ',') s[k++] = s[i];
- s[k] = 0;
- *beg = atoi(s + name_end + 1);
- for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
- *end = i < k? atoi(s + i + 1) : 1<<29;
- if (*beg > 0) --*beg;
- } else *beg = 0, *end = 1<<29;
- free(s);
- return *beg <= *end? 0 : -1;
-}
-
-int32_t bam_aux2i(const uint8_t *s)
-{
- int type;
- if (s == 0) return 0;
- type = *s++;
- if (type == 'c') return (int32_t)*(int8_t*)s;
- else if (type == 'C') return (int32_t)*(uint8_t*)s;
- else if (type == 's') return (int32_t)*(int16_t*)s;
- else if (type == 'S') return (int32_t)*(uint16_t*)s;
- else if (type == 'i' || type == 'I') return *(int32_t*)s;
- else return 0;
-}
-
-float bam_aux2f(const uint8_t *s)
-{
- int type;
- type = *s++;
- if (s == 0) return 0.0;
- if (type == 'f') return *(float*)s;
- else return 0.0;
-}
-
-double bam_aux2d(const uint8_t *s)
-{
- int type;
- type = *s++;
- if (s == 0) return 0.0;
- if (type == 'd') return *(double*)s;
- else return 0.0;
-}
-
-char bam_aux2A(const uint8_t *s)
-{
- int type;
- type = *s++;
- if (s == 0) return 0;
- if (type == 'A') return *(char*)s;
- else return 0;
-}
-
-char *bam_aux2Z(const uint8_t *s)
-{
- int type;
- type = *s++;
- if (s == 0) return 0;
- if (type == 'Z' || type == 'H') return (char*)s;
- else return 0;
-}
-
-#ifdef _WIN32
-double drand48()
-{
- return (double)rand() / RAND_MAX;
-}
-#endif
diff --git a/sam/bam_cat.c b/sam/bam_cat.c
deleted file mode 100644
index a7502b9..0000000
--- a/sam/bam_cat.c
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
-
-bam_cat -- efficiently concatenates bam files
-
-bam_cat can be used to concatenate BAM files. Under special
-circumstances, it can be used as an alternative to 'samtools merge' to
-concatenate multiple sorted files into a single sorted file. For this
-to work each file must be sorted, and the sorted files must be given
-as command line arguments in order such that the final read in file i
-is less than or equal to the first read in file i+1.
-
-This code is derived from the bam_reheader function in samtools 0.1.8
-and modified to perform concatenation by Chris Saunders on behalf of
-Illumina.
-
-
-########## License:
-
-The MIT License
-
-Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd.
-Modified SAMtools work copyright (c) 2010 Illumina, Inc.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-
-*/
-
-
-/*
-makefile:
-"""
-CC=gcc
-CFLAGS+=-g -Wall -O2 -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -I$(SAMTOOLS_DIR)
-LDFLAGS+=-L$(SAMTOOLS_DIR)
-LDLIBS+=-lbam -lz
-
-all:bam_cat
-"""
-*/
-
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-#include "knetfile.h"
-#include "bgzf.h"
-#include "bam.h"
-
-#define BUF_SIZE 0x10000
-
-#define GZIPID1 31
-#define GZIPID2 139
-
-#define BGZF_EMPTY_BLOCK_SIZE 28
-
-
-int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam)
-{
- BGZF *fp;
- FILE* fp_file;
- uint8_t *buf;
- uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];
- const int es=BGZF_EMPTY_BLOCK_SIZE;
- int i;
-
- fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w");
- if (fp == 0) {
- fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam);
- return 1;
- }
- if (h) bam_header_write(fp, h);
-
- buf = (uint8_t*) malloc(BUF_SIZE);
- for(i = 0; i < nfn; ++i){
- BGZF *in;
- bam_header_t *old;
- int len,j;
-
- in = strcmp(fn[i], "-")? bam_open(fn[i], "r") : bam_dopen(fileno(stdin), "r");
- if (in == 0) {
- fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
- return -1;
- }
- if (in->is_write) return -1;
-
- old = bam_header_read(in);
- if (h == 0 && i == 0) bam_header_write(fp, old);
-
- if (in->block_offset < in->block_length) {
- bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
- bgzf_flush(fp);
- }
-
- j=0;
-#ifdef _USE_KNETFILE
- fp_file = fp->fp;
- while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0) {
-#else
- fp_file = fp->fp;
- while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) {
-#endif
- if(len<es){
- int diff=es-len;
- if(j==0) {
- fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]);
- return -1;
- }
- fwrite(ebuf, 1, len, fp_file);
- memcpy(ebuf,ebuf+len,diff);
- memcpy(ebuf+diff,buf,len);
- } else {
- if(j!=0) fwrite(ebuf, 1, es, fp_file);
- len-= es;
- memcpy(ebuf,buf+len,es);
- fwrite(buf, 1, len, fp_file);
- }
- j=1;
- }
-
- /* check final gzip block */
- {
- const uint8_t gzip1=ebuf[0];
- const uint8_t gzip2=ebuf[1];
- const uint32_t isize=*((uint32_t*)(ebuf+es-4));
- if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) {
- fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]);
- fprintf(stderr, " Possible output corruption.\n");
- fwrite(ebuf, 1, es, fp_file);
- }
- }
- bam_header_destroy(old);
- bgzf_close(in);
- }
- free(buf);
- bgzf_close(fp);
- return 0;
-}
-
-
-
-int main_cat(int argc, char *argv[])
-{
- bam_header_t *h = 0;
- char *outfn = 0;
- int c, ret;
- while ((c = getopt(argc, argv, "h:o:")) >= 0) {
- switch (c) {
- case 'h': {
- tamFile fph = sam_open(optarg);
- if (fph == 0) {
- fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]);
- return 1;
- }
- h = sam_header_read(fph);
- sam_close(fph);
- break;
- }
- case 'o': outfn = strdup(optarg); break;
- }
- }
- if (argc - optind < 2) {
- fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n");
- return 1;
- }
- ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
- free(outfn);
- return ret;
-}
diff --git a/sam/bam_color.c b/sam/bam_color.c
deleted file mode 100644
index 8b86e2f..0000000
--- a/sam/bam_color.c
+++ /dev/null
@@ -1,145 +0,0 @@
-#include <ctype.h>
-#include "bam.h"
-
-/*!
- @abstract Get the color encoding the previous and current base
- @param b pointer to an alignment
- @param i The i-th position, 0-based
- @return color
-
- @discussion Returns 0 no color information is found.
- */
-char bam_aux_getCSi(bam1_t *b, int i)
-{
- uint8_t *c = bam_aux_get(b, "CS");
- char *cs = NULL;
-
- // return the base if the tag was not found
- if(0 == c) return 0;
-
- cs = bam_aux2Z(c);
- // adjust for strandedness and leading adaptor
- if(bam1_strand(b)) {
- i = strlen(cs) - 1 - i;
- // adjust for leading hard clip
- uint32_t cigar = bam1_cigar(b)[0];
- if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
- i -= cigar >> BAM_CIGAR_SHIFT;
- }
- } else { i++; }
- return cs[i];
-}
-
-/*!
- @abstract Get the color quality of the color encoding the previous and current base
- @param b pointer to an alignment
- @param i The i-th position, 0-based
- @return color quality
-
- @discussion Returns 0 no color information is found.
- */
-char bam_aux_getCQi(bam1_t *b, int i)
-{
- uint8_t *c = bam_aux_get(b, "CQ");
- char *cq = NULL;
-
- // return the base if the tag was not found
- if(0 == c) return 0;
-
- cq = bam_aux2Z(c);
- // adjust for strandedness
- if(bam1_strand(b)) {
- i = strlen(cq) - 1 - i;
- // adjust for leading hard clip
- uint32_t cigar = bam1_cigar(b)[0];
- if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
- i -= (cigar >> BAM_CIGAR_SHIFT);
- }
- }
- return cq[i];
-}
-
-char bam_aux_nt2int(char a)
-{
- switch(toupper(a)) {
- case 'A':
- return 0;
- break;
- case 'C':
- return 1;
- break;
- case 'G':
- return 2;
- break;
- case 'T':
- return 3;
- break;
- default:
- return 4;
- break;
- }
-}
-
-char bam_aux_ntnt2cs(char a, char b)
-{
- a = bam_aux_nt2int(a);
- b = bam_aux_nt2int(b);
- if(4 == a || 4 == b) return '4';
- return "0123"[(int)(a ^ b)];
-}
-
-/*!
- @abstract Get the color error profile at the give position
- @param b pointer to an alignment
- @return the original color if the color was an error, '-' (dash) otherwise
-
- @discussion Returns 0 no color information is found.
- */
-char bam_aux_getCEi(bam1_t *b, int i)
-{
- int cs_i;
- uint8_t *c = bam_aux_get(b, "CS");
- char *cs = NULL;
- char prev_b, cur_b;
- char cur_color, cor_color;
-
- // return the base if the tag was not found
- if(0 == c) return 0;
-
- cs = bam_aux2Z(c);
-
- // adjust for strandedness and leading adaptor
- if(bam1_strand(b)) { //reverse strand
- cs_i = strlen(cs) - 1 - i;
- // adjust for leading hard clip
- uint32_t cigar = bam1_cigar(b)[0];
- if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
- cs_i -= cigar >> BAM_CIGAR_SHIFT;
- }
- // get current color
- cur_color = cs[cs_i];
- // get previous base. Note: must rc adaptor
- prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)];
- // get current base
- cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];
- }
- else {
- cs_i=i+1;
- // get current color
- cur_color = cs[cs_i];
- // get previous base
- prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)];
- // get current base
- cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];
- }
-
- // corrected color
- cor_color = bam_aux_ntnt2cs(prev_b, cur_b);
-
- if(cur_color == cor_color) {
- return '-';
- }
- else {
- return cur_color;
- }
-}
diff --git a/sam/bam_endian.h b/sam/bam_endian.h
deleted file mode 100644
index 0fc74a8..0000000
--- a/sam/bam_endian.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef BAM_ENDIAN_H
-#define BAM_ENDIAN_H
-
-#include <stdint.h>
-
-static inline int bam_is_big_endian()
-{
- long one= 1;
- return !(*((char *)(&one)));
-}
-static inline uint16_t bam_swap_endian_2(uint16_t v)
-{
- return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8));
-}
-static inline void *bam_swap_endian_2p(void *x)
-{
- *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x);
- return x;
-}
-static inline uint32_t bam_swap_endian_4(uint32_t v)
-{
- v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
- return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
-}
-static inline void *bam_swap_endian_4p(void *x)
-{
- *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x);
- return x;
-}
-static inline uint64_t bam_swap_endian_8(uint64_t v)
-{
- v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
- v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
- return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
-}
-static inline void *bam_swap_endian_8p(void *x)
-{
- *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x);
- return x;
-}
-
-#endif
diff --git a/sam/bam_import.c b/sam/bam_import.c
deleted file mode 100644
index da2bf94..0000000
--- a/sam/bam_import.c
+++ /dev/null
@@ -1,489 +0,0 @@
-#include <zlib.h>
-#include <stdio.h>
-#include <ctype.h>
-#include <string.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <assert.h>
-#ifdef _WIN32
-#include <fcntl.h>
-#endif
-#include "kstring.h"
-#include "bam.h"
-#include "sam_header.h"
-#include "kseq.h"
-#include "khash.h"
-
-KSTREAM_INIT(gzFile, gzread, 16384)
-KHASH_MAP_INIT_STR(ref, uint64_t)
-
-void bam_init_header_hash(bam_header_t *header);
-void bam_destroy_header_hash(bam_header_t *header);
-int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
-
-unsigned char bam_nt16_table[256] = {
- 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
- 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
- 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
- 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
- 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
- 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
- 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
- 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
- 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
- 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
- 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
- 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
- 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
- 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
- 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
- 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
-};
-
-unsigned short bam_char2flag_table[256] = {
- 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
- 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
- 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
- 0,BAM_FREAD1,BAM_FREAD2,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
- 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
- BAM_FPROPER_PAIR,0,BAM_FMREVERSE,0, 0,BAM_FMUNMAP,0,0, 0,0,0,0, 0,0,0,0,
- 0,0,0,0, BAM_FDUP,0,BAM_FQCFAIL,0, 0,0,0,0, 0,0,0,0,
- BAM_FPAIRED,0,BAM_FREVERSE,BAM_FSECONDARY, 0,BAM_FUNMAP,0,0, 0,0,0,0, 0,0,0,0,
- 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
- 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
- 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
- 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
- 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
- 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
- 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
- 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
-};
-
-char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
-
-struct __tamFile_t {
- gzFile fp;
- kstream_t *ks;
- kstring_t *str;
- uint64_t n_lines;
- int is_first;
-};
-
-char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only
-{
- char **list = 0, *s;
- int n = 0, dret, m = 0;
- gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
- kstream_t *ks;
- kstring_t *str;
- str = (kstring_t*)calloc(1, sizeof(kstring_t));
- ks = ks_init(fp);
- while (ks_getuntil(ks, '\n', str, &dret) > 0) {
- if (n == m) {
- m = m? m << 1 : 16;
- list = (char**)realloc(list, m * sizeof(char*));
- }
- if (str->s[str->l-1] == '\r')
- str->s[--str->l] = '\0';
- s = list[n++] = (char*)calloc(str->l + 1, 1);
- strcpy(s, str->s);
- }
- ks_destroy(ks);
- gzclose(fp);
- free(str->s); free(str);
- *_n = n;
- return list;
-}
-
-static bam_header_t *hash2header(const kh_ref_t *hash)
-{
- bam_header_t *header;
- khiter_t k;
- header = bam_header_init();
- header->n_targets = kh_size(hash);
- header->target_name = (char**)calloc(kh_size(hash), sizeof(char*));
- header->target_len = (uint32_t*)calloc(kh_size(hash), 4);
- for (k = kh_begin(hash); k != kh_end(hash); ++k) {
- if (kh_exist(hash, k)) {
- int i = (int)kh_value(hash, k);
- header->target_name[i] = (char*)kh_key(hash, k);
- header->target_len[i] = kh_value(hash, k)>>32;
- }
- }
- bam_init_header_hash(header);
- return header;
-}
-bam_header_t *sam_header_read2(const char *fn)
-{
- bam_header_t *header;
- int c, dret, ret, error = 0;
- gzFile fp;
- kstream_t *ks;
- kstring_t *str;
- kh_ref_t *hash;
- khiter_t k;
- if (fn == 0) return 0;
- fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
- if (fp == 0) return 0;
- hash = kh_init(ref);
- ks = ks_init(fp);
- str = (kstring_t*)calloc(1, sizeof(kstring_t));
- while (ks_getuntil(ks, 0, str, &dret) > 0) {
- char *s = strdup(str->s);
- int len, i;
- i = kh_size(hash);
- ks_getuntil(ks, 0, str, &dret);
- len = atoi(str->s);
- k = kh_put(ref, hash, s, &ret);
- if (ret == 0) {
- fprintf(stderr, "[sam_header_read2] duplicated sequence name: %s\n", s);
- error = 1;
- }
- kh_value(hash, k) = (uint64_t)len<<32 | i;
- if (dret != '\n')
- while ((c = ks_getc(ks)) != '\n' && c != -1);
- }
- ks_destroy(ks);
- gzclose(fp);
- free(str->s); free(str);
- fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash));
- if (error) return 0;
- header = hash2header(hash);
- kh_destroy(ref, hash);
- return header;
-}
-static inline uint8_t *alloc_data(bam1_t *b, int size)
-{
- if (b->m_data < size) {
- b->m_data = size;
- kroundup32(b->m_data);
- b->data = (uint8_t*)realloc(b->data, b->m_data);
- }
- return b->data;
-}
-static inline void parse_error(int64_t n_lines, const char * __restrict msg)
-{
- fprintf(stderr, "Parse error at line %lld: %s\n", (long long)n_lines, msg);
- abort();
-}
-static inline void append_text(bam_header_t *header, kstring_t *str)
-{
- size_t x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null
- kroundup32(x); kroundup32(y);
- if (x < y)
- {
- header->n_text = y;
- header->text = (char*)realloc(header->text, y);
- if ( !header->text )
- {
- fprintf(stderr,"realloc failed to alloc %ld bytes\n", y);
- abort();
- }
- }
- // Sanity check
- if ( header->l_text+str->l+1 >= header->n_text )
- {
- fprintf(stderr,"append_text FIXME: %ld>=%ld, x=%ld,y=%ld\n", header->l_text+str->l+1,(long)header->n_text,x,y);
- abort();
- }
- strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here.
- header->l_text += str->l + 1;
- header->text[header->l_text] = 0;
-}
-
-int sam_header_parse(bam_header_t *h)
-{
- char **tmp;
- int i;
- free(h->target_len); free(h->target_name);
- h->n_targets = 0; h->target_len = 0; h->target_name = 0;
- if (h->l_text < 3) return 0;
- if (h->dict == 0) h->dict = sam_header_parse2(h->text);
- tmp = sam_header2list(h->dict, "SQ", "SN", &h->n_targets);
- if (h->n_targets == 0) return 0;
- h->target_name = calloc(h->n_targets, sizeof(void*));
- for (i = 0; i < h->n_targets; ++i)
- h->target_name[i] = strdup(tmp[i]);
- free(tmp);
- tmp = sam_header2list(h->dict, "SQ", "LN", &h->n_targets);
- h->target_len = calloc(h->n_targets, 4);
- for (i = 0; i < h->n_targets; ++i)
- h->target_len[i] = atoi(tmp[i]);
- free(tmp);
- return h->n_targets;
-}
-
-bam_header_t *sam_header_read(tamFile fp)
-{
- int ret, dret;
- bam_header_t *header = bam_header_init();
- kstring_t *str = fp->str;
- while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header
- str->s[str->l] = dret; // note that str->s is NOT null terminated!!
- append_text(header, str);
- if (dret != '\n') {
- ret = ks_getuntil(fp->ks, '\n', str, &dret);
- str->s[str->l] = '\n'; // NOT null terminated!!
- append_text(header, str);
- }
- ++fp->n_lines;
- }
- sam_header_parse(header);
- bam_init_header_hash(header);
- fp->is_first = 1;
- return header;
-}
-
-int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
-{
- int ret, doff, doff0, dret, z = 0;
- bam1_core_t *c = &b->core;
- kstring_t *str = fp->str;
- kstream_t *ks = fp->ks;
-
- if (fp->is_first) {
- fp->is_first = 0;
- ret = str->l;
- } else {
- do { // special consideration for empty lines
- ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret);
- if (ret >= 0) z += str->l + 1;
- } while (ret == 0);
- }
- if (ret < 0) return -1;
- ++fp->n_lines;
- doff = 0;
-
- { // name
- c->l_qname = strlen(str->s) + 1;
- memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname);
- doff += c->l_qname;
- }
- { // flag
- long flag;
- char *s;
- ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
- flag = strtol((char*)str->s, &s, 0);
- if (*s) { // not the end of the string
- flag = 0;
- for (s = str->s; *s; ++s)
- flag |= bam_char2flag_table[(int)*s];
- }
- c->flag = flag;
- }
- { // tid, pos, qual
- ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s);
- if (c->tid < 0 && strcmp(str->s, "*")) {
- if (header->n_targets == 0) {
- fprintf(stderr, "[sam_read1] missing header? Abort!\n");
- exit(1);
- } else fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s);
- }
- ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
- ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? atoi(str->s) : 0;
- if (ret < 0) return -2;
- }
- { // cigar
- char *s, *t;
- int i, op;
- long x;
- c->n_cigar = 0;
- if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3;
- z += str->l + 1;
- if (str->s[0] != '*') {
- uint32_t *cigar;
- for (s = str->s; *s; ++s) {
- if ((isalpha(*s)) || (*s=='=')) ++c->n_cigar;
- else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character");
- }
- b->data = alloc_data(b, doff + c->n_cigar * 4);
- cigar = bam1_cigar(b);
- for (i = 0, s = str->s; i != c->n_cigar; ++i) {
- x = strtol(s, &t, 10);
- op = toupper(*t);
- if (op == 'M') op = BAM_CMATCH;
- else if (op == 'I') op = BAM_CINS;
- else if (op == 'D') op = BAM_CDEL;
- else if (op == 'N') op = BAM_CREF_SKIP;
- else if (op == 'S') op = BAM_CSOFT_CLIP;
- else if (op == 'H') op = BAM_CHARD_CLIP;
- else if (op == 'P') op = BAM_CPAD;
- else if (op == '=') op = BAM_CEQUAL;
- else if (op == 'X') op = BAM_CDIFF;
- else if (op == 'B') op = BAM_CBACK;
- else parse_error(fp->n_lines, "invalid CIGAR operation");
- s = t + 1;
- cigar[i] = bam_cigar_gen(x, op);
- }
- if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation");
- c->bin = bam_reg2bin(c->pos, bam_calend(c, cigar));
- doff += c->n_cigar * 4;
- } else {
- if (!(c->flag&BAM_FUNMAP)) {
- fprintf(stderr, "Parse warning at line %lld: mapped sequence without CIGAR\n", (long long)fp->n_lines);
- c->flag |= BAM_FUNMAP;
- }
- c->bin = bam_reg2bin(c->pos, c->pos + 1);
- }
- }
- { // mtid, mpos, isize
- ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
- c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid;
- ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
- c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
- ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
- c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? atoi(str->s) : 0;
- if (ret < 0) return -4;
- }
- { // seq and qual
- int i;
- uint8_t *p = 0;
- if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq
- z += str->l + 1;
- if (strcmp(str->s, "*")) {
- c->l_qseq = strlen(str->s);
- if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) {
- fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n",
- (long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b)));
- parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
- }
- p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff;
- memset(p, 0, (c->l_qseq+1)/2);
- for (i = 0; i < c->l_qseq; ++i)
- p[i/2] |= bam_nt16_table[(int)str->s[i]] << 4*(1-i%2);
- } else c->l_qseq = 0;
- if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual
- z += str->l + 1;
- if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s))
- parse_error(fp->n_lines, "sequence and quality are inconsistent");
- p += (c->l_qseq+1)/2;
- if (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff;
- else for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33;
- doff += c->l_qseq + (c->l_qseq+1)/2;
- }
- doff0 = doff;
- if (dret != '\n' && dret != '\r') { // aux
- while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) {
- uint8_t *s, type, key[2];
- z += str->l + 1;
- if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':')
- parse_error(fp->n_lines, "missing colon in auxiliary data");
- key[0] = str->s[0]; key[1] = str->s[1];
- type = str->s[3];
- s = alloc_data(b, doff + 3) + doff;
- s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2;
- if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility
- s = alloc_data(b, doff + 2) + doff;
- *s++ = 'A'; *s = str->s[5];
- doff += 2;
- } else if (type == 'I' || type == 'i') {
- long long x;
- s = alloc_data(b, doff + 5) + doff;
- x = (long long)atoll(str->s + 5);
- if (x < 0) {
- if (x >= -127) {
- *s++ = 'c'; *(int8_t*)s = (int8_t)x;
- s += 1; doff += 2;
- } else if (x >= -32767) {
- *s++ = 's'; *(int16_t*)s = (int16_t)x;
- s += 2; doff += 3;
- } else {
- *s++ = 'i'; *(int32_t*)s = (int32_t)x;
- s += 4; doff += 5;
- if (x < -2147483648ll)
- fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.",
- (long long)fp->n_lines, x);
- }
- } else {
- if (x <= 255) {
- *s++ = 'C'; *s++ = (uint8_t)x;
- doff += 2;
- } else if (x <= 65535) {
- *s++ = 'S'; *(uint16_t*)s = (uint16_t)x;
- s += 2; doff += 3;
- } else {
- *s++ = 'I'; *(uint32_t*)s = (uint32_t)x;
- s += 4; doff += 5;
- if (x > 4294967295ll)
- fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.",
- (long long)fp->n_lines, x);
- }
- }
- } else if (type == 'f') {
- s = alloc_data(b, doff + 5) + doff;
- *s++ = 'f';
- *(float*)s = (float)atof(str->s + 5);
- s += 4; doff += 5;
- } else if (type == 'd') {
- s = alloc_data(b, doff + 9) + doff;
- *s++ = 'd';
- *(float*)s = (float)atof(str->s + 9);
- s += 8; doff += 9;
- } else if (type == 'Z' || type == 'H') {
- int size = 1 + (str->l - 5) + 1;
- if (type == 'H') { // check whether the hex string is valid
- int i;
- if ((str->l - 5) % 2 == 1) parse_error(fp->n_lines, "length of the hex string not even");
- for (i = 0; i < str->l - 5; ++i) {
- int c = toupper(str->s[5 + i]);
- if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
- parse_error(fp->n_lines, "invalid hex character");
- }
- }
- s = alloc_data(b, doff + size) + doff;
- *s++ = type;
- memcpy(s, str->s + 5, str->l - 5);
- s[str->l - 5] = 0;
- doff += size;
- } else if (type == 'B') {
- int32_t n = 0, Bsize, k = 0, size;
- char *p;
- if (str->l < 8) parse_error(fp->n_lines, "too few values in aux type B");
- Bsize = bam_aux_type2size(str->s[5]); // the size of each element
- for (p = (char*)str->s + 6; *p; ++p) // count the number of elements in the array
- if (*p == ',') ++n;
- p = str->s + 7; // now p points to the first number in the array
- size = 6 + Bsize * n; // total number of bytes allocated to this tag
- s = alloc_data(b, doff + 6 * Bsize * n) + doff; // allocate memory
- *s++ = 'B'; *s++ = str->s[5];
- memcpy(s, &n, 4); s += 4; // write the number of elements
- if (str->s[5] == 'c') while (p < str->s + str->l) ((int8_t*)s)[k++] = (int8_t)strtol(p, &p, 0), ++p;
- else if (str->s[5] == 'C') while (p < str->s + str->l) ((uint8_t*)s)[k++] = (uint8_t)strtol(p, &p, 0), ++p;
- else if (str->s[5] == 's') while (p < str->s + str->l) ((int16_t*)s)[k++] = (int16_t)strtol(p, &p, 0), ++p; // FIXME: avoid unaligned memory
- else if (str->s[5] == 'S') while (p < str->s + str->l) ((uint16_t*)s)[k++] = (uint16_t)strtol(p, &p, 0), ++p;
- else if (str->s[5] == 'i') while (p < str->s + str->l) ((int32_t*)s)[k++] = (int32_t)strtol(p, &p, 0), ++p;
- else if (str->s[5] == 'I') while (p < str->s + str->l) ((uint32_t*)s)[k++] = (uint32_t)strtol(p, &p, 0), ++p;
- else if (str->s[5] == 'f') while (p < str->s + str->l) ((float*)s)[k++] = (float)strtod(p, &p), ++p;
- else parse_error(fp->n_lines, "unrecognized array type");
- s += Bsize * n; doff += size;
- } else parse_error(fp->n_lines, "unrecognized type");
- if (dret == '\n' || dret == '\r') break;
- }
- }
- b->l_aux = doff - doff0;
- b->data_len = doff;
- if (bam_no_B) bam_remove_B(b);
- return z;
-}
-
-tamFile sam_open(const char *fn)
-{
- tamFile fp;
- gzFile gzfp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "rb") : gzopen(fn, "rb");
- if (gzfp == 0) return 0;
- fp = (tamFile)calloc(1, sizeof(struct __tamFile_t));
- fp->str = (kstring_t*)calloc(1, sizeof(kstring_t));
- fp->fp = gzfp;
- fp->ks = ks_init(fp->fp);
- return fp;
-}
-
-void sam_close(tamFile fp)
-{
- if (fp) {
- ks_destroy(fp->ks);
- gzclose(fp->fp);
- free(fp->str->s); free(fp->str);
- free(fp);
- }
-}
diff --git a/sam/bam_index.c b/sam/bam_index.c
deleted file mode 100644
index f916e04..0000000
--- a/sam/bam_index.c
+++ /dev/null
@@ -1,726 +0,0 @@
-#include <ctype.h>
-#include <assert.h>
-#include "bam.h"
-#include "khash.h"
-#include "ksort.h"
-#include "bam_endian.h"
-#ifdef _USE_KNETFILE
-#include "knetfile.h"
-#endif
-
-/*!
- @header
-
- Alignment indexing. Before indexing, BAM must be sorted based on the
- leftmost coordinate of alignments. In indexing, BAM uses two indices:
- a UCSC binning index and a simple linear index. The binning index is
- efficient for alignments spanning long distance, while the auxiliary
- linear index helps to reduce unnecessary seek calls especially for
- short alignments.
-
- The UCSC binning scheme was suggested by Richard Durbin and Lincoln
- Stein and is explained by Kent et al. (2002). In this scheme, each bin
- represents a contiguous genomic region which can be fully contained in
- another bin; each alignment is associated with a bin which represents
- the smallest region containing the entire alignment. The binning
- scheme is essentially another representation of R-tree. A distinct bin
- uniquely corresponds to a distinct internal node in a R-tree. Bin A is
- a child of Bin B if region A is contained in B.
-
- In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
- 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
- 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
- find the alignments overlapped with a region [rbeg,rend), we need to
- calculate the list of bins that may be overlapped the region and test
- the alignments in the bins to confirm the overlaps. If the specified
- region is short, typically only a few alignments in six bins need to
- be retrieved. The overlapping alignments can be quickly fetched.
-
- */
-
-#define BAM_MIN_CHUNK_GAP 32768
-// 1<<14 is the size of minimum bin.
-#define BAM_LIDX_SHIFT 14
-
-#define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
-
-typedef struct {
- uint64_t u, v;
-} pair64_t;
-
-#define pair64_lt(a,b) ((a).u < (b).u)
-KSORT_INIT(off, pair64_t, pair64_lt)
-
-typedef struct {
- uint32_t m, n;
- pair64_t *list;
-} bam_binlist_t;
-
-typedef struct {
- int32_t n, m;
- uint64_t *offset;
-} bam_lidx_t;
-
-KHASH_MAP_INIT_INT(i, bam_binlist_t)
-
-struct __bam_index_t {
- int32_t n;
- uint64_t n_no_coor; // unmapped reads without coordinate
- khash_t(i) **index;
- bam_lidx_t *index2;
-};
-
-// requirement: len <= LEN_MASK
-static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
-{
- khint_t k;
- bam_binlist_t *l;
- int ret;
- k = kh_put(i, h, bin, &ret);
- l = &kh_value(h, k);
- if (ret) { // not present
- l->m = 1; l->n = 0;
- l->list = (pair64_t*)calloc(l->m, 16);
- }
- if (l->n == l->m) {
- l->m <<= 1;
- l->list = (pair64_t*)realloc(l->list, l->m * 16);
- }
- l->list[l->n].u = beg; l->list[l->n++].v = end;
-}
-
-static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
-{
- int i, beg, end;
- beg = b->core.pos >> BAM_LIDX_SHIFT;
- end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
- if (index2->m < end + 1) {
- int old_m = index2->m;
- index2->m = end + 1;
- kroundup32(index2->m);
- index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
- memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
- }
- if (beg == end) {
- if (index2->offset[beg] == 0) index2->offset[beg] = offset;
- } else {
- for (i = beg; i <= end; ++i)
- if (index2->offset[i] == 0) index2->offset[i] = offset;
- }
- index2->n = end + 1;
-}
-
-static void merge_chunks(bam_index_t *idx)
-{
-#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
- khash_t(i) *index;
- int i, l, m;
- khint_t k;
- for (i = 0; i < idx->n; ++i) {
- index = idx->index[i];
- for (k = kh_begin(index); k != kh_end(index); ++k) {
- bam_binlist_t *p;
- if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
- p = &kh_value(index, k);
- m = 0;
- for (l = 1; l < p->n; ++l) {
-#ifdef BAM_TRUE_OFFSET
- if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
-#else
- if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
-#endif
- else p->list[++m] = p->list[l];
- } // ~for(l)
- p->n = m + 1;
- } // ~for(k)
- } // ~for(i)
-#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
-}
-
-static void fill_missing(bam_index_t *idx)
-{
- int i, j;
- for (i = 0; i < idx->n; ++i) {
- bam_lidx_t *idx2 = &idx->index2[i];
- for (j = 1; j < idx2->n; ++j)
- if (idx2->offset[j] == 0)
- idx2->offset[j] = idx2->offset[j-1];
- }
-}
-
-bam_index_t *bam_index_core(bamFile fp)
-{
- bam1_t *b;
- bam_header_t *h;
- int i, ret;
- bam_index_t *idx;
- uint32_t last_bin, save_bin;
- int32_t last_coor, last_tid, save_tid;
- bam1_core_t *c;
- uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
-
- h = bam_header_read(fp);
- if(h == 0) {
- fprintf(stderr, "[bam_index_core] Invalid BAM header.");
- return NULL;
- }
-
- idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
- b = (bam1_t*)calloc(1, sizeof(bam1_t));
- c = &b->core;
-
- idx->n = h->n_targets;
- bam_header_destroy(h);
- idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
- for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
- idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
-
- save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
- save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
- n_mapped = n_unmapped = n_no_coor = off_end = 0;
- off_beg = off_end = bam_tell(fp);
- while ((ret = bam_read1(fp, b)) >= 0) {
- if (c->tid < 0) ++n_no_coor;
- if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes
- last_tid = c->tid;
- last_bin = 0xffffffffu;
- } else if ((uint32_t)last_tid > (uint32_t)c->tid) {
- fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n",
- bam1_qname(b), last_tid+1, c->tid+1);
- return NULL;
- } else if ((int32_t)c->tid >= 0 && last_coor > c->pos) {
- fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
- bam1_qname(b), last_coor, c->pos, c->tid+1);
- return NULL;
- }
- if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
- if (c->bin != last_bin) { // then possibly write the binning index
- if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
- insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
- if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
- off_end = last_off;
- insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
- insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
- n_mapped = n_unmapped = 0;
- off_beg = off_end;
- }
- save_off = last_off;
- save_bin = last_bin = c->bin;
- save_tid = c->tid;
- if (save_tid < 0) break;
- }
- if (bam_tell(fp) <= last_off) {
- fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
- (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
- return NULL;
- }
- if (c->flag & BAM_FUNMAP) ++n_unmapped;
- else ++n_mapped;
- last_off = bam_tell(fp);
- last_coor = b->core.pos;
- }
- if (save_tid >= 0) {
- insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
- insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
- insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
- }
- merge_chunks(idx);
- fill_missing(idx);
- if (ret >= 0) {
- while ((ret = bam_read1(fp, b)) >= 0) {
- ++n_no_coor;
- if (c->tid >= 0 && n_no_coor) {
- fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
- return NULL;
- }
- }
- }
- if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
- free(b->data); free(b);
- idx->n_no_coor = n_no_coor;
- return idx;
-}
-
-void bam_index_destroy(bam_index_t *idx)
-{
- khint_t k;
- int i;
- if (idx == 0) return;
- for (i = 0; i < idx->n; ++i) {
- khash_t(i) *index = idx->index[i];
- bam_lidx_t *index2 = idx->index2 + i;
- for (k = kh_begin(index); k != kh_end(index); ++k) {
- if (kh_exist(index, k))
- free(kh_value(index, k).list);
- }
- kh_destroy(i, index);
- free(index2->offset);
- }
- free(idx->index); free(idx->index2);
- free(idx);
-}
-
-void bam_index_save(const bam_index_t *idx, FILE *fp)
-{
- int32_t i, size;
- khint_t k;
- fwrite("BAI\1", 1, 4, fp);
- if (bam_is_be) {
- uint32_t x = idx->n;
- fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
- } else fwrite(&idx->n, 4, 1, fp);
- for (i = 0; i < idx->n; ++i) {
- khash_t(i) *index = idx->index[i];
- bam_lidx_t *index2 = idx->index2 + i;
- // write binning index
- size = kh_size(index);
- if (bam_is_be) { // big endian
- uint32_t x = size;
- fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
- } else fwrite(&size, 4, 1, fp);
- for (k = kh_begin(index); k != kh_end(index); ++k) {
- if (kh_exist(index, k)) {
- bam_binlist_t *p = &kh_value(index, k);
- if (bam_is_be) { // big endian
- uint32_t x;
- x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
- x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
- for (x = 0; (int)x < p->n; ++x) {
- bam_swap_endian_8p(&p->list[x].u);
- bam_swap_endian_8p(&p->list[x].v);
- }
- fwrite(p->list, 16, p->n, fp);
- for (x = 0; (int)x < p->n; ++x) {
- bam_swap_endian_8p(&p->list[x].u);
- bam_swap_endian_8p(&p->list[x].v);
- }
- } else {
- fwrite(&kh_key(index, k), 4, 1, fp);
- fwrite(&p->n, 4, 1, fp);
- fwrite(p->list, 16, p->n, fp);
- }
- }
- }
- // write linear index (index2)
- if (bam_is_be) {
- int x = index2->n;
- fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
- } else fwrite(&index2->n, 4, 1, fp);
- if (bam_is_be) { // big endian
- int x;
- for (x = 0; (int)x < index2->n; ++x)
- bam_swap_endian_8p(&index2->offset[x]);
- fwrite(index2->offset, 8, index2->n, fp);
- for (x = 0; (int)x < index2->n; ++x)
- bam_swap_endian_8p(&index2->offset[x]);
- } else fwrite(index2->offset, 8, index2->n, fp);
- }
- { // write the number of reads coor-less records.
- uint64_t x = idx->n_no_coor;
- if (bam_is_be) bam_swap_endian_8p(&x);
- fwrite(&x, 8, 1, fp);
- }
- fflush(fp);
-}
-
-static bam_index_t *bam_index_load_core(FILE *fp)
-{
- int i;
- char magic[4];
- bam_index_t *idx;
- if (fp == 0) {
- fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
- return 0;
- }
- fread(magic, 1, 4, fp);
- if (strncmp(magic, "BAI\1", 4)) {
- fprintf(stderr, "[bam_index_load] wrong magic number.\n");
- fclose(fp);
- return 0;
- }
- idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
- fread(&idx->n, 4, 1, fp);
- if (bam_is_be) bam_swap_endian_4p(&idx->n);
- idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
- idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
- for (i = 0; i < idx->n; ++i) {
- khash_t(i) *index;
- bam_lidx_t *index2 = idx->index2 + i;
- uint32_t key, size;
- khint_t k;
- int j, ret;
- bam_binlist_t *p;
- index = idx->index[i] = kh_init(i);
- // load binning index
- fread(&size, 4, 1, fp);
- if (bam_is_be) bam_swap_endian_4p(&size);
- for (j = 0; j < (int)size; ++j) {
- fread(&key, 4, 1, fp);
- if (bam_is_be) bam_swap_endian_4p(&key);
- k = kh_put(i, index, key, &ret);
- p = &kh_value(index, k);
- fread(&p->n, 4, 1, fp);
- if (bam_is_be) bam_swap_endian_4p(&p->n);
- p->m = p->n;
- p->list = (pair64_t*)malloc(p->m * 16);
- fread(p->list, 16, p->n, fp);
- if (bam_is_be) {
- int x;
- for (x = 0; x < p->n; ++x) {
- bam_swap_endian_8p(&p->list[x].u);
- bam_swap_endian_8p(&p->list[x].v);
- }
- }
- }
- // load linear index
- fread(&index2->n, 4, 1, fp);
- if (bam_is_be) bam_swap_endian_4p(&index2->n);
- index2->m = index2->n;
- index2->offset = (uint64_t*)calloc(index2->m, 8);
- fread(index2->offset, index2->n, 8, fp);
- if (bam_is_be)
- for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
- }
- if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
- if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
- return idx;
-}
-
-bam_index_t *bam_index_load_local(const char *_fn)
-{
- FILE *fp;
- char *fnidx, *fn;
-
- if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
- const char *p;
- int l = strlen(_fn);
- for (p = _fn + l - 1; p >= _fn; --p)
- if (*p == '/') break;
- fn = strdup(p + 1);
- } else fn = strdup(_fn);
- fnidx = (char*)calloc(strlen(fn) + 5, 1);
- strcpy(fnidx, fn); strcat(fnidx, ".bai");
- fp = fopen(fnidx, "rb");
- if (fp == 0) { // try "{base}.bai"
- char *s = strstr(fn, "bam");
- if (s == fn + strlen(fn) - 3) {
- strcpy(fnidx, fn);
- fnidx[strlen(fn)-1] = 'i';
- fp = fopen(fnidx, "rb");
- }
- }
- free(fnidx); free(fn);
- if (fp) {
- bam_index_t *idx = bam_index_load_core(fp);
- fclose(fp);
- return idx;
- } else return 0;
-}
-
-#ifdef _USE_KNETFILE
-static void download_from_remote(const char *url)
-{
- const int buf_size = 1 * 1024 * 1024;
- char *fn;
- FILE *fp;
- uint8_t *buf;
- knetFile *fp_remote;
- int l;
- if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
- l = strlen(url);
- for (fn = (char*)url + l - 1; fn >= url; --fn)
- if (*fn == '/') break;
- ++fn; // fn now points to the file name
- fp_remote = knet_open(url, "r");
- if (fp_remote == 0) {
- fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
- return;
- }
- if ((fp = fopen(fn, "wb")) == 0) {
- fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
- knet_close(fp_remote);
- return;
- }
- buf = (uint8_t*)calloc(buf_size, 1);
- while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
- fwrite(buf, 1, l, fp);
- free(buf);
- fclose(fp);
- knet_close(fp_remote);
-}
-#else
-static void download_from_remote(const char *url)
-{
- return;
-}
-#endif
-
-bam_index_t *bam_index_load(const char *fn)
-{
- bam_index_t *idx;
- idx = bam_index_load_local(fn);
- if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
- char *fnidx = calloc(strlen(fn) + 5, 1);
- strcat(strcpy(fnidx, fn), ".bai");
- fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
- download_from_remote(fnidx);
- free(fnidx);
- idx = bam_index_load_local(fn);
- }
- if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
- return idx;
-}
-
-int bam_index_build2(const char *fn, const char *_fnidx)
-{
- char *fnidx;
- FILE *fpidx;
- bamFile fp;
- bam_index_t *idx;
- if ((fp = bam_open(fn, "r")) == 0) {
- fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
- return -1;
- }
- idx = bam_index_core(fp);
- bam_close(fp);
- if(idx == 0) {
- fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n");
- return -1;
- }
- if (_fnidx == 0) {
- fnidx = (char*)calloc(strlen(fn) + 5, 1);
- strcpy(fnidx, fn); strcat(fnidx, ".bai");
- } else fnidx = strdup(_fnidx);
- fpidx = fopen(fnidx, "wb");
- if (fpidx == 0) {
- fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
- free(fnidx);
- bam_index_destroy(idx);
- return -1;
- }
- bam_index_save(idx, fpidx);
- bam_index_destroy(idx);
- fclose(fpidx);
- free(fnidx);
- return 0;
-}
-
-int bam_index_build(const char *fn)
-{
- return bam_index_build2(fn, 0);
-}
-
-int bam_index(int argc, char *argv[])
-{
- if (argc < 2) {
- fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
- return 1;
- }
- if (argc >= 3) bam_index_build2(argv[1], argv[2]);
- else bam_index_build(argv[1]);
- return 0;
-}
-
-int bam_idxstats(int argc, char *argv[])
-{
- bam_index_t *idx;
- bam_header_t *header;
- bamFile fp;
- int i;
- if (argc < 2) {
- fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
- return 1;
- }
- fp = bam_open(argv[1], "r");
- if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
- header = bam_header_read(fp);
- bam_close(fp);
- idx = bam_index_load(argv[1]);
- if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
- for (i = 0; i < idx->n; ++i) {
- khint_t k;
- khash_t(i) *h = idx->index[i];
- printf("%s\t%d", header->target_name[i], header->target_len[i]);
- k = kh_get(i, h, BAM_MAX_BIN);
- if (k != kh_end(h))
- printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v);
- else printf("\t0\t0");
- putchar('\n');
- }
- printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor);
- bam_header_destroy(header);
- bam_index_destroy(idx);
- return 0;
-}
-
-static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
-{
- int i = 0, k;
- if (beg >= end) return 0;
- if (end >= 1u<<29) end = 1u<<29;
- --end;
- list[i++] = 0;
- for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
- for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
- for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
- for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
- for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
- return i;
-}
-
-static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
-{
- uint32_t rbeg = b->core.pos;
- uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
- return (rend > beg && rbeg < end);
-}
-
-struct __bam_iter_t {
- int from_first; // read from the first record; no random access
- int tid, beg, end, n_off, i, finished;
- uint64_t curr_off;
- pair64_t *off;
-};
-
-// bam_fetch helper function retrieves
-bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
-{
- uint16_t *bins;
- int i, n_bins, n_off;
- pair64_t *off;
- khint_t k;
- khash_t(i) *index;
- uint64_t min_off;
- bam_iter_t iter = 0;
-
- if (beg < 0) beg = 0;
- if (end < beg) return 0;
- // initialize iter
- iter = calloc(1, sizeof(struct __bam_iter_t));
- iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
- //
- bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
- n_bins = reg2bins(beg, end, bins);
- index = idx->index[tid];
- if (idx->index2[tid].n > 0) {
- min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
- : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
- if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
- int n = beg>>BAM_LIDX_SHIFT;
- if (n > idx->index2[tid].n) n = idx->index2[tid].n;
- for (i = n - 1; i >= 0; --i)
- if (idx->index2[tid].offset[i] != 0) break;
- if (i >= 0) min_off = idx->index2[tid].offset[i];
- }
- } else min_off = 0; // tabix 0.1.2 may produce such index files
- for (i = n_off = 0; i < n_bins; ++i) {
- if ((k = kh_get(i, index, bins[i])) != kh_end(index))
- n_off += kh_value(index, k).n;
- }
- if (n_off == 0) {
- free(bins); return iter;
- }
- off = (pair64_t*)calloc(n_off, 16);
- for (i = n_off = 0; i < n_bins; ++i) {
- if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
- int j;
- bam_binlist_t *p = &kh_value(index, k);
- for (j = 0; j < p->n; ++j)
- if (p->list[j].v > min_off) off[n_off++] = p->list[j];
- }
- }
- free(bins);
- if (n_off == 0) {
- free(off); return iter;
- }
- {
- bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
- int l;
- ks_introsort(off, n_off, off);
- // resolve completely contained adjacent blocks
- for (i = 1, l = 0; i < n_off; ++i)
- if (off[l].v < off[i].v)
- off[++l] = off[i];
- n_off = l + 1;
- // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
- for (i = 1; i < n_off; ++i)
- if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
- { // merge adjacent blocks
-#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
- for (i = 1, l = 0; i < n_off; ++i) {
-#ifdef BAM_TRUE_OFFSET
- if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
-#else
- if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
-#endif
- else off[++l] = off[i];
- }
- n_off = l + 1;
-#endif
- }
- bam_destroy1(b);
- }
- iter->n_off = n_off; iter->off = off;
- return iter;
-}
-
-pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
-{ // for pysam compatibility
- bam_iter_t iter;
- pair64_t *off;
- iter = bam_iter_query(idx, tid, beg, end);
- off = iter->off; *cnt_off = iter->n_off;
- free(iter);
- return off;
-}
-
-void bam_iter_destroy(bam_iter_t iter)
-{
- if (iter) { free(iter->off); free(iter); }
-}
-
-int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
-{
- int ret;
- if (iter && iter->finished) return -1;
- if (iter == 0 || iter->from_first) {
- ret = bam_read1(fp, b);
- if (ret < 0 && iter) iter->finished = 1;
- return ret;
- }
- if (iter->off == 0) return -1;
- for (;;) {
- if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
- if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
- if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
- if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
- bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
- iter->curr_off = bam_tell(fp);
- }
- ++iter->i;
- }
- if ((ret = bam_read1(fp, b)) >= 0) {
- iter->curr_off = bam_tell(fp);
- if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
- ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
- break;
- }
- else if (is_overlap(iter->beg, iter->end, b)) return ret;
- } else break; // end of file or error
- }
- iter->finished = 1;
- return ret;
-}
-
-int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
-{
- int ret;
- bam_iter_t iter;
- bam1_t *b;
- b = bam_init1();
- iter = bam_iter_query(idx, tid, beg, end);
- while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data);
- bam_iter_destroy(iter);
- bam_destroy1(b);
- return (ret == -1)? 0 : ret;
-}
diff --git a/sam/bam_lpileup.c b/sam/bam_lpileup.c
deleted file mode 100644
index d4dd63b..0000000
--- a/sam/bam_lpileup.c
+++ /dev/null
@@ -1,198 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <assert.h>
-#include "bam.h"
-#include "ksort.h"
-
-#define TV_GAP 2
-
-typedef struct __freenode_t {
- uint32_t level:28, cnt:4;
- struct __freenode_t *next;
-} freenode_t, *freenode_p;
-
-#define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level))
-KSORT_INIT(node, freenode_p, freenode_lt)
-
-/* Memory pool, similar to the one in bam_pileup.c */
-typedef struct {
- int cnt, n, max;
- freenode_t **buf;
-} mempool_t;
-
-static mempool_t *mp_init()
-{
- return (mempool_t*)calloc(1, sizeof(mempool_t));
-}
-static void mp_destroy(mempool_t *mp)
-{
- int k;
- for (k = 0; k < mp->n; ++k) free(mp->buf[k]);
- free(mp->buf); free(mp);
-}
-static inline freenode_t *mp_alloc(mempool_t *mp)
-{
- ++mp->cnt;
- if (mp->n == 0) return (freenode_t*)calloc(1, sizeof(freenode_t));
- else return mp->buf[--mp->n];
-}
-static inline void mp_free(mempool_t *mp, freenode_t *p)
-{
- --mp->cnt; p->next = 0; p->cnt = TV_GAP;
- if (mp->n == mp->max) {
- mp->max = mp->max? mp->max<<1 : 256;
- mp->buf = (freenode_t**)realloc(mp->buf, sizeof(freenode_t*) * mp->max);
- }
- mp->buf[mp->n++] = p;
-}
-
-/* core part */
-struct __bam_lplbuf_t {
- int max, n_cur, n_pre;
- int max_level, *cur_level, *pre_level;
- mempool_t *mp;
- freenode_t **aux, *head, *tail;
- int n_nodes, m_aux;
- bam_pileup_f func;
- void *user_data;
- bam_plbuf_t *plbuf;
-};
-
-void bam_lplbuf_reset(bam_lplbuf_t *buf)
-{
- freenode_t *p, *q;
- bam_plbuf_reset(buf->plbuf);
- for (p = buf->head; p->next;) {
- q = p->next;
- mp_free(buf->mp, p);
- p = q;
- }
- buf->head = buf->tail;
- buf->max_level = 0;
- buf->n_cur = buf->n_pre = 0;
- buf->n_nodes = 0;
-}
-
-static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
-{
- bam_lplbuf_t *tv = (bam_lplbuf_t*)data;
- freenode_t *p;
- int i, l, max_level;
- // allocate memory if necessary
- if (tv->max < n) { // enlarge
- tv->max = n;
- kroundup32(tv->max);
- tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max);
- tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max);
- }
- tv->n_cur = n;
- // update cnt
- for (p = tv->head; p->next; p = p->next)
- if (p->cnt > 0) --p->cnt;
- // calculate cur_level[]
- max_level = 0;
- for (i = l = 0; i < n; ++i) {
- const bam_pileup1_t *p = pl + i;
- if (p->is_head) {
- if (tv->head->next && tv->head->cnt == 0) { // then take a free slot
- freenode_t *p = tv->head->next;
- tv->cur_level[i] = tv->head->level;
- mp_free(tv->mp, tv->head);
- tv->head = p;
- --tv->n_nodes;
- } else tv->cur_level[i] = ++tv->max_level;
- } else {
- tv->cur_level[i] = tv->pre_level[l++];
- if (p->is_tail) { // then return a free slot
- tv->tail->level = tv->cur_level[i];
- tv->tail->next = mp_alloc(tv->mp);
- tv->tail = tv->tail->next;
- ++tv->n_nodes;
- }
- }
- if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i];
- ((bam_pileup1_t*)p)->level = tv->cur_level[i];
- }
- assert(l == tv->n_pre);
- tv->func(tid, pos, n, pl, tv->user_data);
- // sort the linked list
- if (tv->n_nodes) {
- freenode_t *q;
- if (tv->n_nodes + 1 > tv->m_aux) { // enlarge
- tv->m_aux = tv->n_nodes + 1;
- kroundup32(tv->m_aux);
- tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux);
- }
- for (p = tv->head, i = l = 0; p->next;) {
- if (p->level > max_level) { // then discard this entry
- q = p->next;
- mp_free(tv->mp, p);
- p = q;
- } else {
- tv->aux[i++] = p;
- p = p->next;
- }
- }
- tv->aux[i] = tv->tail; // add a proper tail for the loop below
- tv->n_nodes = i;
- if (tv->n_nodes) {
- ks_introsort(node, tv->n_nodes, tv->aux);
- for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1];
- tv->head = tv->aux[0];
- } else tv->head = tv->tail;
- }
- // clean up
- tv->max_level = max_level;
- memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4);
- // squeeze out terminated levels
- for (i = l = 0; i < n; ++i) {
- const bam_pileup1_t *p = pl + i;
- if (!p->is_tail)
- tv->pre_level[l++] = tv->pre_level[i];
- }
- tv->n_pre = l;
-/*
- fprintf(stderr, "%d\t", pos+1);
- for (i = 0; i < n; ++i) {
- const bam_pileup1_t *p = pl + i;
- if (p->is_head) fprintf(stderr, "^");
- if (p->is_tail) fprintf(stderr, "$");
- fprintf(stderr, "%d,", p->level);
- }
- fprintf(stderr, "\n");
-*/
- return 0;
-}
-
-bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data)
-{
- bam_lplbuf_t *tv;
- tv = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t));
- tv->mp = mp_init();
- tv->head = tv->tail = mp_alloc(tv->mp);
- tv->func = func;
- tv->user_data = data;
- tv->plbuf = bam_plbuf_init(tview_func, tv);
- return (bam_lplbuf_t*)tv;
-}
-
-void bam_lplbuf_destroy(bam_lplbuf_t *tv)
-{
- freenode_t *p, *q;
- free(tv->cur_level); free(tv->pre_level);
- bam_plbuf_destroy(tv->plbuf);
- free(tv->aux);
- for (p = tv->head; p->next;) {
- q = p->next;
- mp_free(tv->mp, p); p = q;
- }
- mp_free(tv->mp, p);
- assert(tv->mp->cnt == 0);
- mp_destroy(tv->mp);
- free(tv);
-}
-
-int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv)
-{
- return bam_plbuf_push(b, tv->plbuf);
-}
diff --git a/sam/bam_mate.c b/sam/bam_mate.c
deleted file mode 100644
index b947c9d..0000000
--- a/sam/bam_mate.c
+++ /dev/null
@@ -1,128 +0,0 @@
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include "kstring.h"
-#include "bam.h"
-
-void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
-{
- bam1_t *swap;
- int i, end;
- uint32_t *cigar;
- str->l = 0;
- if (b1->core.tid != b2->core.tid || b1->core.tid < 0) return; // coordinateless or not on the same chr; skip
- if (b1->core.pos > b2->core.pos) swap = b1, b1 = b2, b2 = swap; // make sure b1 has a smaller coordinate
- kputc((b1->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index
- kputc((b1->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand
- for (i = 0, cigar = bam1_cigar(b1); i < b1->core.n_cigar; ++i) {
- kputw(bam_cigar_oplen(cigar[i]), str);
- kputc(bam_cigar_opchr(cigar[i]), str);
- }
- end = bam_calend(&b1->core, cigar);
- kputw(b2->core.pos - end, str);
- kputc('T', str);
- kputc((b2->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index
- kputc((b2->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand
- for (i = 0, cigar = bam1_cigar(b2); i < b2->core.n_cigar; ++i) {
- kputw(bam_cigar_oplen(cigar[i]), str);
- kputc(bam_cigar_opchr(cigar[i]), str);
- }
- bam_aux_append(b1, "CT", 'Z', str->l+1, (uint8_t*)str->s);
-}
-
-// currently, this function ONLY works if each read has one hit
-void bam_mating_core(bamFile in, bamFile out, int remove_reads)
-{
- bam_header_t *header;
- bam1_t *b[2];
- int curr, has_prev, pre_end = 0, cur_end;
- kstring_t str;
-
- str.l = str.m = 0; str.s = 0;
- header = bam_header_read(in);
- bam_header_write(out, header);
-
- b[0] = bam_init1();
- b[1] = bam_init1();
- curr = 0; has_prev = 0;
- while (bam_read1(in, b[curr]) >= 0) {
- bam1_t *cur = b[curr], *pre = b[1-curr];
- if (cur->core.tid < 0)
- {
- if ( !remove_reads ) bam_write1(out, cur);
- continue;
- }
- cur_end = bam_calend(&cur->core, bam1_cigar(cur));
- if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP;
- if (cur->core.flag & BAM_FSECONDARY)
- {
- if ( !remove_reads ) bam_write1(out, cur);
- continue; // skip secondary alignments
- }
- if (has_prev) {
- if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name
- cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos;
- pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos;
- if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
- && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // set TLEN/ISIZE
- {
- uint32_t cur5, pre5;
- cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos;
- pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos;
- cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5;
- } else cur->core.isize = pre->core.isize = 0;
- if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE;
- else cur->core.flag &= ~BAM_FMREVERSE;
- if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE;
- else pre->core.flag &= ~BAM_FMREVERSE;
- if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; }
- if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; }
- bam_template_cigar(pre, cur, &str);
- bam_write1(out, pre);
- bam_write1(out, cur);
- has_prev = 0;
- } else { // unpaired or singleton
- pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
- if (pre->core.flag & BAM_FPAIRED) {
- pre->core.flag |= BAM_FMUNMAP;
- pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR;
- }
- bam_write1(out, pre);
- }
- } else has_prev = 1;
- curr = 1 - curr;
- pre_end = cur_end;
- }
- if (has_prev) bam_write1(out, b[1-curr]);
- bam_header_destroy(header);
- bam_destroy1(b[0]);
- bam_destroy1(b[1]);
- free(str.s);
-}
-
-void usage()
-{
- fprintf(stderr,"Usage: samtools fixmate <in.nameSrt.bam> <out.nameSrt.bam>\n");
- fprintf(stderr,"Options:\n");
- fprintf(stderr," -r remove unmapped reads and secondary alignments\n");
- exit(1);
-}
-
-int bam_mating(int argc, char *argv[])
-{
- bamFile in, out;
- int c, remove_reads=0;
- while ((c = getopt(argc, argv, "r")) >= 0) {
- switch (c) {
- case 'r': remove_reads=1; break;
- }
- }
- if (optind+1 >= argc) usage();
- in = (strcmp(argv[optind], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[optind], "r");
- out = (strcmp(argv[optind+1], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[optind+1], "w");
- bam_mating_core(in, out, remove_reads);
- bam_close(in); bam_close(out);
- return 0;
-}
-
-
diff --git a/sam/bam_md.c b/sam/bam_md.c
deleted file mode 100644
index ce40a12..0000000
--- a/sam/bam_md.c
+++ /dev/null
@@ -1,389 +0,0 @@
-#include <unistd.h>
-#include <assert.h>
-#include <string.h>
-#include <ctype.h>
-#include <math.h>
-#include "faidx.h"
-#include "sam.h"
-#include "kstring.h"
-#include "kaln.h"
-#include "kprobaln.h"
-
-#define USE_EQUAL 1
-#define DROP_TAG 2
-#define BIN_QUAL 4
-#define UPDATE_NM 8
-#define UPDATE_MD 16
-#define HASH_QNM 32
-
-char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
-
-int bam_aux_drop_other(bam1_t *b, uint8_t *s);
-
-void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
-{
- uint8_t *seq = bam1_seq(b);
- uint32_t *cigar = bam1_cigar(b);
- bam1_core_t *c = &b->core;
- int i, x, y, u = 0;
- kstring_t *str;
- int32_t old_nm_i = -1, nm = 0;
-
- str = (kstring_t*)calloc(1, sizeof(kstring_t));
- for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
- int j, l = cigar[i]>>4, op = cigar[i]&0xf;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (j = 0; j < l; ++j) {
- int z = y + j;
- int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
- if (ref[x+j] == 0) break; // out of boundary
- if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
- if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
- ++u;
- } else {
- kputw(u, str); kputc(ref[x+j], str);
- u = 0; ++nm;
- }
- }
- if (j < l) break;
- x += l; y += l;
- } else if (op == BAM_CDEL) {
- kputw(u, str); kputc('^', str);
- for (j = 0; j < l; ++j) {
- if (ref[x+j] == 0) break;
- kputc(ref[x+j], str);
- }
- u = 0;
- if (j < l) break;
- x += l; nm += l;
- } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
- y += l;
- if (op == BAM_CINS) nm += l;
- } else if (op == BAM_CREF_SKIP) {
- x += l;
- }
- }
- kputw(u, str);
- // apply max_nm
- if (max_nm > 0 && nm >= max_nm) {
- for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
- int j, l = cigar[i]>>4, op = cigar[i]&0xf;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (j = 0; j < l; ++j) {
- int z = y + j;
- int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
- if (ref[x+j] == 0) break; // out of boundary
- if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
- seq[z/2] |= (z&1)? 0x0f : 0xf0;
- bam1_qual(b)[z] = 0;
- }
- }
- if (j < l) break;
- x += l; y += l;
- } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
- else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
- }
- }
- // update NM
- if (flag & UPDATE_NM) {
- uint8_t *old_nm = bam_aux_get(b, "NM");
- if (c->flag & BAM_FUNMAP) return;
- if (old_nm) old_nm_i = bam_aux2i(old_nm);
- if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
- else if (nm != old_nm_i) {
- fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm);
- bam_aux_del(b, old_nm);
- bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
- }
- }
- // update MD
- if (flag & UPDATE_MD) {
- uint8_t *old_md = bam_aux_get(b, "MD");
- if (c->flag & BAM_FUNMAP) return;
- if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
- else {
- int is_diff = 0;
- if (strlen((char*)old_md+1) == str->l) {
- for (i = 0; i < str->l; ++i)
- if (toupper(old_md[i+1]) != toupper(str->s[i]))
- break;
- if (i < str->l) is_diff = 1;
- } else is_diff = 1;
- if (is_diff) {
- fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s);
- bam_aux_del(b, old_md);
- bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
- }
- }
- }
- // drop all tags but RG
- if (flag&DROP_TAG) {
- uint8_t *q = bam_aux_get(b, "RG");
- bam_aux_drop_other(b, q);
- }
- // reduce the resolution of base quality
- if (flag&BIN_QUAL) {
- uint8_t *qual = bam1_qual(b);
- for (i = 0; i < b->core.l_qseq; ++i)
- if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
- }
- free(str->s); free(str);
-}
-
-void bam_fillmd1(bam1_t *b, char *ref, int flag)
-{
- bam_fillmd1_core(b, ref, flag, 0);
-}
-
-int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
-{
- uint8_t *seq = bam1_seq(b), *qual = bam1_qual(b);
- uint32_t *cigar = bam1_cigar(b);
- bam1_core_t *c = &b->core;
- int i, x, y, mm, q, len, clip_l, clip_q;
- double t;
- if (thres < 0) thres = 40; // set the default
- mm = q = len = clip_l = clip_q = 0;
- for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
- int j, l = cigar[i]>>4, op = cigar[i]&0xf;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (j = 0; j < l; ++j) {
- int z = y + j;
- int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
- if (ref[x+j] == 0) break; // out of boundary
- if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
- ++len;
- if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
- ++mm;
- q += qual[z] > 33? 33 : qual[z];
- }
- }
- }
- if (j < l) break;
- x += l; y += l; len += l;
- } else if (op == BAM_CDEL) {
- for (j = 0; j < l; ++j)
- if (ref[x+j] == 0) break;
- if (j < l) break;
- x += l;
- } else if (op == BAM_CSOFT_CLIP) {
- for (j = 0; j < l; ++j) clip_q += qual[y+j];
- clip_l += l;
- y += l;
- } else if (op == BAM_CHARD_CLIP) {
- clip_q += 13 * l;
- clip_l += l;
- } else if (op == BAM_CINS) y += l;
- else if (op == BAM_CREF_SKIP) x += l;
- }
- for (i = 0, t = 1; i < mm; ++i)
- t *= (double)len / (i+1);
- t = q - 4.343 * log(t) + clip_q / 5.;
- if (t > thres) return -1;
- if (t < 0) t = 0;
- t = sqrt((thres - t) / thres) * thres;
-// fprintf(stderr, "%s %lf %d\n", bam1_qname(b), t, q);
- return (int)(t + .499);
-}
-
-int bam_prob_realn_core(bam1_t *b, const char *ref, int flag)
-{
- int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4;
- uint32_t *cigar = bam1_cigar(b);
- bam1_core_t *c = &b->core;
- kpa_par_t conf = kpa_par_def;
- uint8_t *bq = 0, *zq = 0, *qual = bam1_qual(b);
- if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0) return -1; // do nothing
- // test if BQ or ZQ is present
- if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq;
- if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq;
- if (bq && redo_baq)
- {
- bam_aux_del(b, bq-1);
- bq = 0;
- }
- if (bq && zq) { // remove the ZQ tag
- bam_aux_del(b, zq-1);
- zq = 0;
- }
- if (bq || zq) {
- if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing
- if (bq && apply_baq) { // then convert BQ to ZQ
- for (i = 0; i < c->l_qseq; ++i)
- qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64);
- *(bq - 3) = 'Z';
- } else if (zq && !apply_baq) { // then convert ZQ to BQ
- for (i = 0; i < c->l_qseq; ++i)
- qual[i] += (int)zq[i] - 64;
- *(zq - 3) = 'B';
- }
- return 0;
- }
- // find the start and end of the alignment
- x = c->pos, y = 0, yb = ye = xb = xe = -1;
- for (k = 0; k < c->n_cigar; ++k) {
- int op, l;
- op = cigar[k]&0xf; l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- if (yb < 0) yb = y;
- if (xb < 0) xb = x;
- ye = y + l; xe = x + l;
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip
- }
- // set bandwidth and the start and the end
- bw = 7;
- if (abs((xe - xb) - (ye - yb)) > bw)
- bw = abs((xe - xb) - (ye - yb)) + 3;
- conf.bw = bw;
- xb -= yb + bw/2; if (xb < 0) xb = 0;
- xe += c->l_qseq - ye + bw/2;
- if (xe - xb - c->l_qseq > bw)
- xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
- { // glocal
- uint8_t *s, *r, *q, *seq = bam1_seq(b), *bq;
- int *state;
- bq = calloc(c->l_qseq + 1, 1);
- memcpy(bq, qual, c->l_qseq);
- s = calloc(c->l_qseq, 1);
- for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam1_seqi(seq, i)];
- r = calloc(xe - xb, 1);
- for (i = xb; i < xe; ++i) {
- if (ref[i] == 0) { xe = i; break; }
- r[i-xb] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[i]]];
- }
- state = calloc(c->l_qseq, sizeof(int));
- q = calloc(c->l_qseq, 1);
- kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q);
- if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
- for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
- int op = cigar[k]&0xf, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = y; i < y + l; ++i) {
- if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0;
- else bq[i] = bq[i] < q[i]? bq[i] : q[i];
- }
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- }
- for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ
- } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!)
- uint8_t *left, *rght;
- left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1);
- for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
- int op = cigar[k]&0xf, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = y; i < y + l; ++i)
- bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
- for (left[y] = bq[y], i = y + 1; i < y + l; ++i)
- left[i] = bq[i] > left[i-1]? bq[i] : left[i-1];
- for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i)
- rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1];
- for (i = y; i < y + l; ++i)
- bq[i] = left[i] < rght[i]? left[i] : rght[i];
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- }
- for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ
- free(left); free(rght);
- }
- if (apply_baq) {
- for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual
- bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq);
- } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
- free(bq); free(s); free(r); free(q); free(state);
- }
- return 0;
-}
-
-int bam_prob_realn(bam1_t *b, const char *ref)
-{
- return bam_prob_realn_core(b, ref, 1);
-}
-
-int bam_fillmd(int argc, char *argv[])
-{
- int c, flt_flag, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
- samfile_t *fp, *fpout = 0;
- faidx_t *fai;
- char *ref = 0, mode_w[8], mode_r[8];
- bam1_t *b;
-
- flt_flag = UPDATE_NM | UPDATE_MD;
- is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
- mode_w[0] = mode_r[0] = 0;
- strcpy(mode_r, "r"); strcpy(mode_w, "w");
- while ((c = getopt(argc, argv, "EqreuNhbSC:n:Ad")) >= 0) {
- switch (c) {
- case 'r': is_realn = 1; break;
- case 'e': flt_flag |= USE_EQUAL; break;
- case 'd': flt_flag |= DROP_TAG; break;
- case 'q': flt_flag |= BIN_QUAL; break;
- case 'h': flt_flag |= HASH_QNM; break;
- case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break;
- case 'b': is_bam_out = 1; break;
- case 'u': is_uncompressed = is_bam_out = 1; break;
- case 'S': is_sam_in = 1; break;
- case 'n': max_nm = atoi(optarg); break;
- case 'C': capQ = atoi(optarg); break;
- case 'A': baq_flag |= 1; break;
- case 'E': baq_flag |= 2; break;
- default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1;
- }
- }
- if (!is_sam_in) strcat(mode_r, "b");
- if (is_bam_out) strcat(mode_w, "b");
- else strcat(mode_w, "h");
- if (is_uncompressed) strcat(mode_w, "u");
- if (optind + 1 >= argc) {
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools fillmd [-eubrS] <aln.bam> <ref.fasta>\n\n");
- fprintf(stderr, "Options: -e change identical bases to '='\n");
- fprintf(stderr, " -u uncompressed BAM output (for piping)\n");
- fprintf(stderr, " -b compressed BAM output\n");
- fprintf(stderr, " -S the input is SAM with header\n");
- fprintf(stderr, " -A modify the quality string\n");
- fprintf(stderr, " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n");
- fprintf(stderr, " -E extended BAQ for better sensitivity but lower specificity\n\n");
- return 1;
- }
- fp = samopen(argv[optind], mode_r, 0);
- if (fp == 0) return 1;
- if (is_sam_in && (fp->header == 0 || fp->header->n_targets == 0)) {
- fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
- return 1;
- }
- fpout = samopen("-", mode_w, fp->header);
- fai = fai_load(argv[optind+1]);
-
- b = bam_init1();
- while ((ret = samread(fp, b)) >= 0) {
- if (b->core.tid >= 0) {
- if (tid != b->core.tid) {
- free(ref);
- ref = fai_fetch(fai, fp->header->target_name[b->core.tid], &len);
- tid = b->core.tid;
- if (ref == 0)
- fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
- fp->header->target_name[tid]);
- }
- if (is_realn) bam_prob_realn_core(b, ref, baq_flag);
- if (capQ > 10) {
- int q = bam_cap_mapQ(b, ref, capQ);
- if (b->core.qual > q) b->core.qual = q;
- }
- if (ref) bam_fillmd1_core(b, ref, flt_flag, max_nm);
- }
- samwrite(fpout, b);
- }
- bam_destroy1(b);
-
- free(ref);
- fai_destroy(fai);
- samclose(fp); samclose(fpout);
- return 0;
-}
diff --git a/sam/bam_pileup.c b/sam/bam_pileup.c
deleted file mode 100644
index 57434e0..0000000
--- a/sam/bam_pileup.c
+++ /dev/null
@@ -1,437 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <assert.h>
-#include "sam.h"
-
-typedef struct {
- int k, x, y, end;
-} cstate_t;
-
-static cstate_t g_cstate_null = { -1, 0, 0, 0 };
-
-typedef struct __linkbuf_t {
- bam1_t b;
- uint32_t beg, end;
- cstate_t s;
- struct __linkbuf_t *next;
-} lbnode_t;
-
-/* --- BEGIN: Memory pool */
-
-typedef struct {
- int cnt, n, max;
- lbnode_t **buf;
-} mempool_t;
-
-static mempool_t *mp_init()
-{
- mempool_t *mp;
- mp = (mempool_t*)calloc(1, sizeof(mempool_t));
- return mp;
-}
-static void mp_destroy(mempool_t *mp)
-{
- int k;
- for (k = 0; k < mp->n; ++k) {
- free(mp->buf[k]->b.data);
- free(mp->buf[k]);
- }
- free(mp->buf);
- free(mp);
-}
-static inline lbnode_t *mp_alloc(mempool_t *mp)
-{
- ++mp->cnt;
- if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
- else return mp->buf[--mp->n];
-}
-static inline void mp_free(mempool_t *mp, lbnode_t *p)
-{
- --mp->cnt; p->next = 0; // clear lbnode_t::next here
- if (mp->n == mp->max) {
- mp->max = mp->max? mp->max<<1 : 256;
- mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
- }
- mp->buf[mp->n++] = p;
-}
-
-/* --- END: Memory pool */
-
-/* --- BEGIN: Auxiliary functions */
-
-/* s->k: the index of the CIGAR operator that has just been processed.
- s->x: the reference coordinate of the start of s->k
- s->y: the query coordiante of the start of s->k
- */
-static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s)
-{
-#define _cop(c) ((c)&BAM_CIGAR_MASK)
-#define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
-
- bam1_t *b = p->b;
- bam1_core_t *c = &b->core;
- uint32_t *cigar = bam1_cigar(b);
- int k, is_head = 0;
- // determine the current CIGAR operation
-// fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam1_qname(b), pos, s->end, s->k, s->x, s->y);
- if (s->k == -1) { // never processed
- is_head = 1;
- if (c->n_cigar == 1) { // just one operation, save a loop
- if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
- } else { // find the first match or deletion
- for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
- int op = _cop(cigar[k]);
- int l = _cln(cigar[k]);
- if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
- else if (op == BAM_CREF_SKIP) s->x += l;
- else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
- }
- assert(k < c->n_cigar);
- s->k = k;
- }
- } else { // the read has been processed before
- int op, l = _cln(cigar[s->k]);
- if (pos - s->x >= l) { // jump to the next operation
- assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
- op = _cop(cigar[s->k+1]);
- if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
- if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
- s->x += l;
- ++s->k;
- } else { // find the next M/D/N/=/X
- if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
- s->x += l;
- for (k = s->k + 1; k < c->n_cigar; ++k) {
- op = _cop(cigar[k]), l = _cln(cigar[k]);
- if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
- else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
- }
- s->k = k;
- }
- assert(s->k < c->n_cigar); // otherwise a bug
- } // else, do nothing
- }
- { // collect pileup information
- int op, l;
- op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
- p->is_del = p->indel = p->is_refskip = 0;
- if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
- int op2 = _cop(cigar[s->k+1]);
- int l2 = _cln(cigar[s->k+1]);
- if (op2 == BAM_CDEL) p->indel = -(int)l2;
- else if (op2 == BAM_CINS) p->indel = l2;
- else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
- int l3 = 0;
- for (k = s->k + 2; k < c->n_cigar; ++k) {
- op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
- if (op2 == BAM_CINS) l3 += l2;
- else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
- }
- if (l3 > 0) p->indel = l3;
- }
- }
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- p->qpos = s->y + (pos - s->x);
- } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
- p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
- p->is_refskip = (op == BAM_CREF_SKIP);
- } // cannot be other operations; otherwise a bug
- p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
- }
- return 1;
-}
-
-/* --- END: Auxiliary functions */
-
-/*******************
- * pileup iterator *
- *******************/
-
-struct __bam_plp_t {
- mempool_t *mp;
- lbnode_t *head, *tail, *dummy;
- int32_t tid, pos, max_tid, max_pos;
- int is_eof, flag_mask, max_plp, error, maxcnt;
- bam_pileup1_t *plp;
- // for the "auto" interface only
- bam1_t *b;
- bam_plp_auto_f func;
- void *data;
-};
-
-bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
-{
- bam_plp_t iter;
- iter = calloc(1, sizeof(struct __bam_plp_t));
- iter->mp = mp_init();
- iter->head = iter->tail = mp_alloc(iter->mp);
- iter->dummy = mp_alloc(iter->mp);
- iter->max_tid = iter->max_pos = -1;
- iter->flag_mask = BAM_DEF_MASK;
- iter->maxcnt = 8000;
- if (func) {
- iter->func = func;
- iter->data = data;
- iter->b = bam_init1();
- }
- return iter;
-}
-
-void bam_plp_destroy(bam_plp_t iter)
-{
- mp_free(iter->mp, iter->dummy);
- mp_free(iter->mp, iter->head);
- if (iter->mp->cnt != 0)
- fprintf(stderr, "[bam_plp_destroy] memory leak: %d. Continue anyway.\n", iter->mp->cnt);
- mp_destroy(iter->mp);
- if (iter->b) bam_destroy1(iter->b);
- free(iter->plp);
- free(iter);
-}
-
-const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
-{
- if (iter->error) { *_n_plp = -1; return 0; }
- *_n_plp = 0;
- if (iter->is_eof && iter->head->next == 0) return 0;
- while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
- int n_plp = 0;
- lbnode_t *p, *q;
- // write iter->plp at iter->pos
- iter->dummy->next = iter->head;
- for (p = iter->head, q = iter->dummy; p->next; q = p, p = p->next) {
- if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
- q->next = p->next; mp_free(iter->mp, p); p = q;
- } else if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
- if (n_plp == iter->max_plp) { // then double the capacity
- iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
- iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
- }
- iter->plp[n_plp].b = &p->b;
- if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
- }
- }
- iter->head = iter->dummy->next; // dummy->next may be changed
- *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
- // update iter->tid and iter->pos
- if (iter->head->next) {
- if (iter->tid > iter->head->b.core.tid) {
- fprintf(stderr, "[%s] unsorted input. Pileup aborts.\n", __func__);
- iter->error = 1;
- *_n_plp = -1;
- return 0;
- }
- }
- if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
- iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
- } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
- iter->pos = iter->head->beg; // jump to the next position
- } else ++iter->pos; // scan contiguously
- // return
- if (n_plp) return iter->plp;
- if (iter->is_eof && iter->head->next == 0) break;
- }
- return 0;
-}
-
-int bam_plp_push(bam_plp_t iter, const bam1_t *b)
-{
- if (iter->error) return -1;
- if (b) {
- if (b->core.tid < 0) return 0;
- if (b->core.flag & iter->flag_mask) return 0;
- if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) return 0;
- bam_copy1(&iter->tail->b, b);
- iter->tail->beg = b->core.pos; iter->tail->end = bam_calend(&b->core, bam1_cigar(b));
- iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
- if (b->core.tid < iter->max_tid) {
- fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
- iter->error = 1;
- return -1;
- }
- if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
- fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
- iter->error = 1;
- return -1;
- }
- iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
- if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
- iter->tail->next = mp_alloc(iter->mp);
- iter->tail = iter->tail->next;
- }
- } else iter->is_eof = 1;
- return 0;
-}
-
-const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
-{
- const bam_pileup1_t *plp;
- if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
- if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
- else { // no pileup line can be obtained; read alignments
- *_n_plp = 0;
- if (iter->is_eof) return 0;
- while (iter->func(iter->data, iter->b) >= 0) {
- if (bam_plp_push(iter, iter->b) < 0) {
- *_n_plp = -1;
- return 0;
- }
- if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
- // otherwise no pileup line can be returned; read the next alignment.
- }
- bam_plp_push(iter, 0);
- if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
- return 0;
- }
-}
-
-void bam_plp_reset(bam_plp_t iter)
-{
- lbnode_t *p, *q;
- iter->max_tid = iter->max_pos = -1;
- iter->tid = iter->pos = 0;
- iter->is_eof = 0;
- for (p = iter->head; p->next;) {
- q = p->next;
- mp_free(iter->mp, p);
- p = q;
- }
- iter->head = iter->tail;
-}
-
-void bam_plp_set_mask(bam_plp_t iter, int mask)
-{
- iter->flag_mask = mask < 0? BAM_DEF_MASK : (BAM_FUNMAP | mask);
-}
-
-void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
-{
- iter->maxcnt = maxcnt;
-}
-
-/*****************
- * callback APIs *
- *****************/
-
-int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)
-{
- bam_plbuf_t *buf;
- int ret;
- bam1_t *b;
- b = bam_init1();
- buf = bam_plbuf_init(func, func_data);
- bam_plbuf_set_mask(buf, mask);
- while ((ret = bam_read1(fp, b)) >= 0)
- bam_plbuf_push(b, buf);
- bam_plbuf_push(0, buf);
- bam_plbuf_destroy(buf);
- bam_destroy1(b);
- return 0;
-}
-
-void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask)
-{
- bam_plp_set_mask(buf->iter, mask);
-}
-
-void bam_plbuf_reset(bam_plbuf_t *buf)
-{
- bam_plp_reset(buf->iter);
-}
-
-bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)
-{
- bam_plbuf_t *buf;
- buf = calloc(1, sizeof(bam_plbuf_t));
- buf->iter = bam_plp_init(0, 0);
- buf->func = func;
- buf->data = data;
- return buf;
-}
-
-void bam_plbuf_destroy(bam_plbuf_t *buf)
-{
- bam_plp_destroy(buf->iter);
- free(buf);
-}
-
-int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf)
-{
- int ret, n_plp, tid, pos;
- const bam_pileup1_t *plp;
- ret = bam_plp_push(buf->iter, b);
- if (ret < 0) return ret;
- while ((plp = bam_plp_next(buf->iter, &tid, &pos, &n_plp)) != 0)
- buf->func(tid, pos, n_plp, plp, buf->data);
- return 0;
-}
-
-/***********
- * mpileup *
- ***********/
-
-struct __bam_mplp_t {
- int n;
- uint64_t min, *pos;
- bam_plp_t *iter;
- int *n_plp;
- const bam_pileup1_t **plp;
-};
-
-bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
-{
- int i;
- bam_mplp_t iter;
- iter = calloc(1, sizeof(struct __bam_mplp_t));
- iter->pos = calloc(n, 8);
- iter->n_plp = calloc(n, sizeof(int));
- iter->plp = calloc(n, sizeof(void*));
- iter->iter = calloc(n, sizeof(void*));
- iter->n = n;
- iter->min = (uint64_t)-1;
- for (i = 0; i < n; ++i) {
- iter->iter[i] = bam_plp_init(func, data[i]);
- iter->pos[i] = iter->min;
- }
- return iter;
-}
-
-void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
-{
- int i;
- for (i = 0; i < iter->n; ++i)
- iter->iter[i]->maxcnt = maxcnt;
-}
-
-void bam_mplp_destroy(bam_mplp_t iter)
-{
- int i;
- for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
- free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);
- free(iter);
-}
-
-int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
-{
- int i, ret = 0;
- uint64_t new_min = (uint64_t)-1;
- for (i = 0; i < iter->n; ++i) {
- if (iter->pos[i] == iter->min) {
- int tid, pos;
- iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
- iter->pos[i] = (uint64_t)tid<<32 | pos;
- }
- if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
- }
- iter->min = new_min;
- if (new_min == (uint64_t)-1) return 0;
- *_tid = new_min>>32; *_pos = (uint32_t)new_min;
- for (i = 0; i < iter->n; ++i) {
- if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line"
- n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
- ++ret;
- } else n_plp[i] = 0, plp[i] = 0;
- }
- return ret;
-}
diff --git a/sam/bam_plcmd.c b/sam/bam_plcmd.c
deleted file mode 100644
index 54a4597..0000000
--- a/sam/bam_plcmd.c
+++ /dev/null
@@ -1,606 +0,0 @@
-#include <math.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <ctype.h>
-#include <string.h>
-#include <errno.h>
-#include <sys/stat.h>
-#include <getopt.h>
-#include "sam.h"
-#include "faidx.h"
-#include "kstring.h"
-#include "sam_header.h"
-
-static inline int printw(int c, FILE *fp)
-{
- char buf[16];
- int l, x;
- if (c == 0) return fputc('0', fp);
- for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
- if (c < 0) buf[l++] = '-';
- buf[l] = 0;
- for (x = 0; x < l/2; ++x) {
- int y = buf[x]; buf[x] = buf[l-1-x]; buf[l-1-x] = y;
- }
- fputs(buf, fp);
- return 0;
-}
-
-static inline void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, const char *ref)
-{
- int j;
- if (p->is_head) {
- putchar('^');
- putchar(p->b->core.qual > 93? 126 : p->b->core.qual + 33);
- }
- if (!p->is_del) {
- int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
- if (ref) {
- int rb = pos < ref_len? ref[pos] : 'N';
- if (c == '=' || bam_nt16_table[c] == bam_nt16_table[rb]) c = bam1_strand(p->b)? ',' : '.';
- else c = bam1_strand(p->b)? tolower(c) : toupper(c);
- } else {
- if (c == '=') c = bam1_strand(p->b)? ',' : '.';
- else c = bam1_strand(p->b)? tolower(c) : toupper(c);
- }
- putchar(c);
- } else putchar(p->is_refskip? (bam1_strand(p->b)? '<' : '>') : '*');
- if (p->indel > 0) {
- putchar('+'); printw(p->indel, stdout);
- for (j = 1; j <= p->indel; ++j) {
- int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
- putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
- }
- } else if (p->indel < 0) {
- printw(p->indel, stdout);
- for (j = 1; j <= -p->indel; ++j) {
- int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N';
- putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
- }
- }
- if (p->is_tail) putchar('$');
-}
-
-#include <assert.h>
-#include "bam2bcf.h"
-#include "sample.h"
-
-#define MPLP_GLF 0x10
-#define MPLP_NO_COMP 0x20
-#define MPLP_NO_ORPHAN 0x40
-#define MPLP_REALN 0x80
-#define MPLP_NO_INDEL 0x400
-#define MPLP_REDO_BAQ 0x800
-#define MPLP_ILLUMINA13 0x1000
-#define MPLP_IGNORE_RG 0x2000
-#define MPLP_PRINT_POS 0x4000
-#define MPLP_PRINT_MAPQ 0x8000
-#define MPLP_PER_SAMPLE 0x10000
-
-void *bed_read(const char *fn);
-void bed_destroy(void *_h);
-int bed_overlap(const void *_h, const char *chr, int beg, int end);
-
-typedef struct {
- int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag;
- int rflag_require, rflag_filter;
- int openQ, extQ, tandemQ, min_support; // for indels
- double min_frac; // for indels
- char *reg, *pl_list, *fai_fname;
- faidx_t *fai;
- void *bed, *rghash;
-} mplp_conf_t;
-
-typedef struct {
- bamFile fp;
- bam_iter_t iter;
- bam_header_t *h;
- int ref_id;
- char *ref;
- const mplp_conf_t *conf;
-} mplp_aux_t;
-
-typedef struct {
- int n;
- int *n_plp, *m_plp;
- bam_pileup1_t **plp;
-} mplp_pileup_t;
-
-static int mplp_func(void *data, bam1_t *b)
-{
- extern int bam_realn(bam1_t *b, const char *ref);
- extern int bam_prob_realn_core(bam1_t *b, const char *ref, int);
- extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres);
- mplp_aux_t *ma = (mplp_aux_t*)data;
- int ret, skip = 0;
- do {
- int has_ref;
- ret = ma->iter? bam_iter_read(ma->fp, ma->iter, b) : bam_read1(ma->fp, b);
- if (ret < 0) break;
- if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) { // exclude unmapped reads
- skip = 1;
- continue;
- }
- if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; }
- if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; }
- if (ma->conf->bed) { // test overlap
- skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
- if (skip) continue;
- }
- if (ma->conf->rghash) { // exclude read groups
- uint8_t *rg = bam_aux_get(b, "RG");
- skip = (rg && bcf_str2id(ma->conf->rghash, (const char*)(rg+1)) >= 0);
- if (skip) continue;
- }
- if (ma->conf->flag & MPLP_ILLUMINA13) {
- int i;
- uint8_t *qual = bam1_qual(b);
- for (i = 0; i < b->core.l_qseq; ++i)
- qual[i] = qual[i] > 31? qual[i] - 31 : 0;
- }
- has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0;
- skip = 0;
- if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
- if (has_ref && ma->conf->capQ_thres > 10) {
- int q = bam_cap_mapQ(b, ma->ref, ma->conf->capQ_thres);
- if (q < 0) skip = 1;
- else if (b->core.qual > q) b->core.qual = q;
- }
- else if (b->core.qual < ma->conf->min_mq) skip = 1;
- else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1;
- } while (skip);
- return ret;
-}
-
-static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf,
- int n, char *const*fn, int *n_plp, const bam_pileup1_t **plp, int ignore_rg)
-{
- int i, j;
- memset(m->n_plp, 0, m->n * sizeof(int));
- for (i = 0; i < n; ++i) {
- for (j = 0; j < n_plp[i]; ++j) {
- const bam_pileup1_t *p = plp[i] + j;
- uint8_t *q;
- int id = -1;
- q = ignore_rg? 0 : bam_aux_get(p->b, "RG");
- if (q) id = bam_smpl_rg2smid(sm, fn[i], (char*)q+1, buf);
- if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf);
- if (id < 0 || id >= m->n) {
- assert(q); // otherwise a bug
- fprintf(stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]);
- exit(1);
- }
- if (m->n_plp[id] == m->m_plp[id]) {
- m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
- m->plp[id] = realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]);
- }
- m->plp[id][m->n_plp[id]++] = *p;
- }
- }
-}
-
-static int mpileup(mplp_conf_t *conf, int n, char **fn)
-{
- extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
- extern void bcf_call_del_rghash(void *rghash);
- mplp_aux_t **data;
- int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth;
- const bam_pileup1_t **plp;
- bam_mplp_t iter;
- bam_header_t *h = 0;
- char *ref;
- void *rghash = 0;
-
- bcf_callaux_t *bca = 0;
- bcf_callret1_t *bcr = 0;
- bcf_call_t bc;
- bcf_t *bp = 0;
- bcf_hdr_t *bh = 0;
-
- bam_sample_t *sm = 0;
- kstring_t buf;
- mplp_pileup_t gplp;
-
- memset(&gplp, 0, sizeof(mplp_pileup_t));
- memset(&buf, 0, sizeof(kstring_t));
- memset(&bc, 0, sizeof(bcf_call_t));
- data = calloc(n, sizeof(void*));
- plp = calloc(n, sizeof(void*));
- n_plp = calloc(n, sizeof(int*));
- sm = bam_smpl_init();
-
- // read the header and initialize data
- for (i = 0; i < n; ++i) {
- bam_header_t *h_tmp;
- data[i] = calloc(1, sizeof(mplp_aux_t));
- data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
- if ( !data[i]->fp )
- {
- fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno));
- exit(1);
- }
- data[i]->conf = conf;
- h_tmp = bam_header_read(data[i]->fp);
- if ( !h_tmp ) {
- fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]);
- exit(1);
- }
- data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
- bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
- rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
- if (conf->reg) {
- int beg, end;
- bam_index_t *idx;
- idx = bam_index_load(fn[i]);
- if (idx == 0) {
- fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]);
- exit(1);
- }
- if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
- fprintf(stderr, "[%s] malformatted region or wrong seqname for %s\n", __func__, fn[i]);
- exit(1);
- }
- if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
- data[i]->iter = bam_iter_query(idx, tid, beg, end);
- bam_index_destroy(idx);
- }
- if (i == 0) h = h_tmp;
- else {
- // FIXME: to check consistency
- bam_header_destroy(h_tmp);
- }
- }
- gplp.n = sm->n;
- gplp.n_plp = calloc(sm->n, sizeof(int));
- gplp.m_plp = calloc(sm->n, sizeof(int));
- gplp.plp = calloc(sm->n, sizeof(void*));
-
- fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
- // write the VCF header
- if (conf->flag & MPLP_GLF) {
- kstring_t s;
- bh = calloc(1, sizeof(bcf_hdr_t));
- s.l = s.m = 0; s.s = 0;
- bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
- for (i = 0; i < h->n_targets; ++i) {
- kputs(h->target_name[i], &s);
- kputc('\0', &s);
- }
- bh->l_nm = s.l;
- bh->name = malloc(s.l);
- memcpy(bh->name, s.s, s.l);
- s.l = 0;
- for (i = 0; i < sm->n; ++i) {
- kputs(sm->smpl[i], &s); kputc('\0', &s);
- }
- bh->l_smpl = s.l;
- bh->sname = malloc(s.l);
- memcpy(bh->sname, s.s, s.l);
- s.l = 0;
- ksprintf(&s, "##samtoolsVersion=%s\n", BAM_VERSION);
- if (conf->fai_fname) ksprintf(&s, "##reference=file://%s\n", conf->fai_fname);
- h->dict = sam_header_parse2(h->text);
- int nseq;
- const char *tags[] = {"SN","LN","UR","M5",NULL};
- char **tbl = sam_header2tbl_n(h->dict, "SQ", tags, &nseq);
- for (i=0; i<nseq; i++)
- {
- ksprintf(&s, "##contig=<ID=%s", tbl[4*i]);
- if ( tbl[4*i+1] ) ksprintf(&s, ",length=%s", tbl[4*i+1]);
- if ( tbl[4*i+2] ) ksprintf(&s, ",URL=%s", tbl[4*i+2]);
- if ( tbl[4*i+3] ) ksprintf(&s, ",md5=%s", tbl[4*i+3]);
- kputs(">\n", &s);
- }
- if (tbl) free(tbl);
- bh->txt = s.s;
- bh->l_txt = 1 + s.l;
- bcf_hdr_sync(bh);
- bcf_hdr_write(bp, bh);
- bca = bcf_call_init(-1., conf->min_baseQ);
- bcr = calloc(sm->n, sizeof(bcf_callret1_t));
- bca->rghash = rghash;
- bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
- bca->min_frac = conf->min_frac;
- bca->min_support = conf->min_support;
- bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
- }
- if (tid0 >= 0 && conf->fai) { // region is set
- ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
- ref_tid = tid0;
- for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
- } else ref_tid = -1, ref = 0;
- iter = bam_mplp_init(n, mplp_func, (void**)data);
- max_depth = conf->max_depth;
- if (max_depth * sm->n > 1<<20)
- fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
- if (max_depth * sm->n < 8000) {
- max_depth = 8000 / sm->n;
- fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
- }
- max_indel_depth = conf->max_indel_depth * sm->n;
- bam_mplp_set_maxcnt(iter, max_depth);
- while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
- if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
- if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
- if (tid != ref_tid) {
- free(ref); ref = 0;
- if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
- for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
- ref_tid = tid;
- }
- if (conf->flag & MPLP_GLF) {
- int total_depth, _ref0, ref16;
- bcf1_t *b = calloc(1, sizeof(bcf1_t));
- for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
- group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
- _ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
- ref16 = bam_nt16_table[_ref0];
- for (i = 0; i < gplp.n; ++i)
- bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
- bcf_call_combine(gplp.n, bcr, bca, ref16, &bc);
- bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, 0, 0);
- bcf_write(bp, bh, b);
- bcf_destroy(b);
- // call indels
- if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
- for (i = 0; i < gplp.n; ++i)
- bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
- if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) {
- b = calloc(1, sizeof(bcf1_t));
- bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, bca, ref);
- bcf_write(bp, bh, b);
- bcf_destroy(b);
- }
- }
- } else {
- printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
- for (i = 0; i < n; ++i) {
- int j, cnt;
- for (j = cnt = 0; j < n_plp[i]; ++j) {
- const bam_pileup1_t *p = plp[i] + j;
- if (bam1_qual(p->b)[p->qpos] >= conf->min_baseQ) ++cnt;
- }
- printf("\t%d\t", cnt);
- if (n_plp[i] == 0) {
- printf("*\t*"); // FIXME: printf() is very slow...
- if (conf->flag & MPLP_PRINT_POS) printf("\t*");
- } else {
- for (j = 0; j < n_plp[i]; ++j) {
- const bam_pileup1_t *p = plp[i] + j;
- if (bam1_qual(p->b)[p->qpos] >= conf->min_baseQ)
- pileup_seq(plp[i] + j, pos, ref_len, ref);
- }
- putchar('\t');
- for (j = 0; j < n_plp[i]; ++j) {
- const bam_pileup1_t *p = plp[i] + j;
- int c = bam1_qual(p->b)[p->qpos];
- if (c >= conf->min_baseQ) {
- c = c + 33 < 126? c + 33 : 126;
- putchar(c);
- }
- }
- if (conf->flag & MPLP_PRINT_MAPQ) {
- putchar('\t');
- for (j = 0; j < n_plp[i]; ++j) {
- int c = plp[i][j].b->core.qual + 33;
- if (c > 126) c = 126;
- putchar(c);
- }
- }
- if (conf->flag & MPLP_PRINT_POS) {
- putchar('\t');
- for (j = 0; j < n_plp[i]; ++j) {
- if (j > 0) putchar(',');
- printf("%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow...
- }
- }
- }
- }
- putchar('\n');
- }
- }
-
- bcf_close(bp);
- bam_smpl_destroy(sm); free(buf.s);
- for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
- free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
- bcf_call_del_rghash(rghash);
- bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
- bam_mplp_destroy(iter);
- bam_header_destroy(h);
- for (i = 0; i < n; ++i) {
- bam_close(data[i]->fp);
- if (data[i]->iter) bam_iter_destroy(data[i]->iter);
- free(data[i]);
- }
- free(data); free(plp); free(ref); free(n_plp);
- return 0;
-}
-
-#define MAX_PATH_LEN 1024
-int read_file_list(const char *file_list,int *n,char **argv[])
-{
- char buf[MAX_PATH_LEN];
- int len, nfiles = 0;
- char **files = NULL;
- struct stat sb;
-
- *n = 0;
- *argv = NULL;
-
- FILE *fh = fopen(file_list,"r");
- if ( !fh )
- {
- fprintf(stderr,"%s: %s\n", file_list,strerror(errno));
- return 1;
- }
-
- files = calloc(nfiles,sizeof(char*));
- nfiles = 0;
- while ( fgets(buf,MAX_PATH_LEN,fh) )
- {
- // allow empty lines and trailing spaces
- len = strlen(buf);
- while ( len>0 && isspace(buf[len-1]) ) len--;
- if ( !len ) continue;
-
- // check sanity of the file list
- buf[len] = 0;
- if (stat(buf, &sb) != 0)
- {
- // no such file, check if it is safe to print its name
- int i, safe_to_print = 1;
- for (i=0; i<len; i++)
- if (!isprint(buf[i])) { safe_to_print = 0; break; }
- if ( safe_to_print )
- fprintf(stderr,"The file list \"%s\" appears broken, could not locate: %s\n", file_list,buf);
- else
- fprintf(stderr,"Does the file \"%s\" really contain a list of files and do all exist?\n", file_list);
- return 1;
- }
-
- nfiles++;
- files = realloc(files,nfiles*sizeof(char*));
- files[nfiles-1] = strdup(buf);
- }
- fclose(fh);
- if ( !nfiles )
- {
- fprintf(stderr,"No files read from %s\n", file_list);
- return 1;
- }
- *argv = files;
- *n = nfiles;
- return 0;
-}
-#undef MAX_PATH_LEN
-
-int bam_mpileup(int argc, char *argv[])
-{
- int c;
- const char *file_list = NULL;
- char **fn = NULL;
- int nfiles = 0, use_orphan = 0;
- mplp_conf_t mplp;
- memset(&mplp, 0, sizeof(mplp_conf_t));
- mplp.max_mq = 60;
- mplp.min_baseQ = 13;
- mplp.capQ_thres = 0;
- mplp.max_depth = 250; mplp.max_indel_depth = 250;
- mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
- mplp.min_frac = 0.002; mplp.min_support = 1;
- mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN;
- static struct option lopts[] =
- {
- {"rf",1,0,1}, // require flag
- {"ff",1,0,2}, // filter flag
- {0,0,0,0}
- };
- while ((c = getopt_long(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsV1:2:",lopts,NULL)) >= 0) {
- switch (c) {
- case 1 : mplp.rflag_require = strtol(optarg,0,0); break;
- case 2 : mplp.rflag_filter = strtol(optarg,0,0); break;
- case 'f':
- mplp.fai = fai_load(optarg);
- if (mplp.fai == 0) return 1;
- mplp.fai_fname = optarg;
- break;
- case 'd': mplp.max_depth = atoi(optarg); break;
- case 'r': mplp.reg = strdup(optarg); break;
- case 'l': mplp.bed = bed_read(optarg); break;
- case 'P': mplp.pl_list = strdup(optarg); break;
- case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
- case 'g': mplp.flag |= MPLP_GLF; break;
- case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break;
- case 'a': mplp.flag |= MPLP_NO_ORPHAN | MPLP_REALN; break;
- case 'B': mplp.flag &= ~MPLP_REALN; break;
- case 'D': mplp.fmt_flag |= B2B_FMT_DP; break;
- case 'S': mplp.fmt_flag |= B2B_FMT_SP; break;
- case 'V': mplp.fmt_flag |= B2B_FMT_DV; break;
- case 'I': mplp.flag |= MPLP_NO_INDEL; break;
- case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
- case '6': mplp.flag |= MPLP_ILLUMINA13; break;
- case 'R': mplp.flag |= MPLP_IGNORE_RG; break;
- case 's': mplp.flag |= MPLP_PRINT_MAPQ; break;
- case 'O': mplp.flag |= MPLP_PRINT_POS; break;
- case 'C': mplp.capQ_thres = atoi(optarg); break;
- case 'M': mplp.max_mq = atoi(optarg); break;
- case 'q': mplp.min_mq = atoi(optarg); break;
- case 'Q': mplp.min_baseQ = atoi(optarg); break;
- case 'b': file_list = optarg; break;
- case 'o': mplp.openQ = atoi(optarg); break;
- case 'e': mplp.extQ = atoi(optarg); break;
- case 'h': mplp.tandemQ = atoi(optarg); break;
- case 'A': use_orphan = 1; break;
- case 'F': mplp.min_frac = atof(optarg); break;
- case 'm': mplp.min_support = atoi(optarg); break;
- case 'L': mplp.max_indel_depth = atoi(optarg); break;
- case 'G': {
- FILE *fp_rg;
- char buf[1024];
- mplp.rghash = bcf_str2id_init();
- if ((fp_rg = fopen(optarg, "r")) == 0)
- fprintf(stderr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg);
- while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but forgive me...
- bcf_str2id_add(mplp.rghash, strdup(buf));
- fclose(fp_rg);
- }
- break;
- }
- }
- if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN;
- if (argc == 1) {
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n");
- fprintf(stderr, "Input options:\n\n");
- fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n");
- fprintf(stderr, " -A count anomalous read pairs\n");
- fprintf(stderr, " -B disable BAQ computation\n");
- fprintf(stderr, " -b FILE list of input BAM filenames, one per line [null]\n");
- fprintf(stderr, " -C INT parameter for adjusting mapQ; 0 to disable [0]\n");
- fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth);
- fprintf(stderr, " -E recalculate extended BAQ on the fly thus ignoring existing BQs\n");
- fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n");
- fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n");
- fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n");
- fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq);
- fprintf(stderr, " -r STR region in which pileup is generated [null]\n");
- fprintf(stderr, " -R ignore RG tags\n");
- fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq);
- fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ);
- fprintf(stderr, " --rf INT required flags: skip reads with mask bits unset []\n");
- fprintf(stderr, " --ff INT filter flags: skip reads with mask bits set []\n");
- fprintf(stderr, "\nOutput options:\n\n");
- fprintf(stderr, " -D output per-sample DP in BCF (require -g/-u)\n");
- fprintf(stderr, " -g generate BCF output (genotype likelihoods)\n");
- fprintf(stderr, " -O output base positions on reads (disabled by -g/-u)\n");
- fprintf(stderr, " -s output mapping quality (disabled by -g/-u)\n");
- fprintf(stderr, " -S output per-sample strand bias P-value in BCF (require -g/-u)\n");
- fprintf(stderr, " -u generate uncompress BCF output\n");
- fprintf(stderr, "\nSNP/INDEL genotype likelihoods options (effective with `-g' or `-u'):\n\n");
- fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ);
- fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac);
- fprintf(stderr, " -h INT coefficient for homopolymer errors [%d]\n", mplp.tandemQ);
- fprintf(stderr, " -I do not perform indel calling\n");
- fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth);
- fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support);
- fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ);
- fprintf(stderr, " -p apply -m and -F per-sample to increase sensitivity\n");
- fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n");
- fprintf(stderr, "\n");
- fprintf(stderr, "Notes: Assuming diploid individuals.\n\n");
- return 1;
- }
- bam_no_B = 1;
- if (file_list) {
- if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
- mpileup(&mplp,nfiles,fn);
- for (c=0; c<nfiles; c++) free(fn[c]);
- free(fn);
- } else mpileup(&mplp, argc - optind, argv + optind);
- if (mplp.rghash) bcf_str2id_thorough_destroy(mplp.rghash);
- free(mplp.reg); free(mplp.pl_list);
- if (mplp.fai) fai_destroy(mplp.fai);
- if (mplp.bed) bed_destroy(mplp.bed);
- return 0;
-}
diff --git a/sam/bam_reheader.c b/sam/bam_reheader.c
deleted file mode 100644
index 6619428..0000000
--- a/sam/bam_reheader.c
+++ /dev/null
@@ -1,62 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "knetfile.h"
-#include "bgzf.h"
-#include "bam.h"
-
-#define BUF_SIZE 0x10000
-
-int bam_reheader(BGZF *in, const bam_header_t *h, int fd)
-{
- BGZF *fp;
- bam_header_t *old;
- int len;
- uint8_t *buf;
- if (in->is_write) return -1;
- buf = malloc(BUF_SIZE);
- old = bam_header_read(in);
- fp = bgzf_fdopen(fd, "w");
- bam_header_write(fp, h);
- if (in->block_offset < in->block_length) {
- bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
- bgzf_flush(fp);
- }
-#ifdef _USE_KNETFILE
- while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0)
- fwrite(buf, 1, len, fp->fp);
-#else
- while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0)
- fwrite(buf, 1, len, fp->file);
-#endif
- free(buf);
- fp->block_offset = in->block_offset = 0;
- bgzf_close(fp);
- return 0;
-}
-
-int main_reheader(int argc, char *argv[])
-{
- bam_header_t *h;
- BGZF *in;
- if (argc != 3) {
- fprintf(stderr, "Usage: samtools reheader <in.header.sam> <in.bam>\n");
- return 1;
- }
- { // read the header
- tamFile fph = sam_open(argv[1]);
- if (fph == 0) {
- fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]);
- return 1;
- }
- h = sam_header_read(fph);
- sam_close(fph);
- }
- in = strcmp(argv[2], "-")? bam_open(argv[2], "r") : bam_dopen(fileno(stdin), "r");
- if (in == 0) {
- fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[2]);
- return 1;
- }
- bam_reheader(in, h, fileno(stdout));
- bgzf_close(in);
- return 0;
-}
diff --git a/sam/bam_rmdup.c b/sam/bam_rmdup.c
deleted file mode 100644
index f0d2b5d..0000000
--- a/sam/bam_rmdup.c
+++ /dev/null
@@ -1,206 +0,0 @@
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include <zlib.h>
-#include <unistd.h>
-#include "sam.h"
-
-typedef bam1_t *bam1_p;
-
-#include "khash.h"
-KHASH_SET_INIT_STR(name)
-KHASH_MAP_INIT_INT64(pos, bam1_p)
-
-#define BUFFER_SIZE 0x40000
-
-typedef struct {
- uint64_t n_checked, n_removed;
- khash_t(pos) *best_hash;
-} lib_aux_t;
-KHASH_MAP_INIT_STR(lib, lib_aux_t)
-
-typedef struct {
- int n, max;
- bam1_t **a;
-} tmp_stack_t;
-
-static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
-{
- if (stack->n == stack->max) {
- stack->max = stack->max? stack->max<<1 : 0x10000;
- stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max);
- }
- stack->a[stack->n++] = b;
-}
-
-static inline void dump_best(tmp_stack_t *stack, samfile_t *out)
-{
- int i;
- for (i = 0; i != stack->n; ++i) {
- samwrite(out, stack->a[i]);
- bam_destroy1(stack->a[i]);
- }
- stack->n = 0;
-}
-
-static void clear_del_set(khash_t(name) *del_set)
-{
- khint_t k;
- for (k = kh_begin(del_set); k < kh_end(del_set); ++k)
- if (kh_exist(del_set, k))
- free((char*)kh_key(del_set, k));
- kh_clear(name, del_set);
-}
-
-static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
-{
- khint_t k = kh_get(lib, aux, lib);
- if (k == kh_end(aux)) {
- int ret;
- char *p = strdup(lib);
- lib_aux_t *q;
- k = kh_put(lib, aux, p, &ret);
- q = &kh_val(aux, k);
- q->n_checked = q->n_removed = 0;
- q->best_hash = kh_init(pos);
- return q;
- } else return &kh_val(aux, k);
-}
-
-static void clear_best(khash_t(lib) *aux, int max)
-{
- khint_t k;
- for (k = kh_begin(aux); k != kh_end(aux); ++k) {
- if (kh_exist(aux, k)) {
- lib_aux_t *q = &kh_val(aux, k);
- if (kh_size(q->best_hash) >= max)
- kh_clear(pos, q->best_hash);
- }
- }
-}
-
-static inline int sum_qual(const bam1_t *b)
-{
- int i, q;
- uint8_t *qual = bam1_qual(b);
- for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
- return q;
-}
-
-void bam_rmdup_core(samfile_t *in, samfile_t *out)
-{
- bam1_t *b;
- int last_tid = -1, last_pos = -1;
- tmp_stack_t stack;
- khint_t k;
- khash_t(lib) *aux;
- khash_t(name) *del_set;
-
- aux = kh_init(lib);
- del_set = kh_init(name);
- b = bam_init1();
- memset(&stack, 0, sizeof(tmp_stack_t));
-
- kh_resize(name, del_set, 4 * BUFFER_SIZE);
- while (samread(in, b) >= 0) {
- bam1_core_t *c = &b->core;
- if (c->tid != last_tid || last_pos != c->pos) {
- dump_best(&stack, out); // write the result
- clear_best(aux, BUFFER_SIZE);
- if (c->tid != last_tid) {
- clear_best(aux, 0);
- if (kh_size(del_set)) { // check
- fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set));
- clear_del_set(del_set);
- }
- if ((int)c->tid == -1) { // append unmapped reads
- samwrite(out, b);
- while (samread(in, b) >= 0) samwrite(out, b);
- break;
- }
- last_tid = c->tid;
- fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]);
- }
- }
- if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
- samwrite(out, b);
- } else if (c->isize > 0) { // paired, head
- uint64_t key = (uint64_t)c->pos<<32 | c->isize;
- const char *lib;
- lib_aux_t *q;
- int ret;
- lib = bam_get_library(in->header, b);
- q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
- ++q->n_checked;
- k = kh_put(pos, q->best_hash, key, &ret);
- if (ret == 0) { // found in best_hash
- bam1_t *p = kh_val(q->best_hash, k);
- ++q->n_removed;
- if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle
- kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed
- bam_copy1(p, b); // replaced as b
- } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed
- if (ret == 0)
- fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b));
- } else { // not found in best_hash
- kh_val(q->best_hash, k) = bam_dup1(b);
- stack_insert(&stack, kh_val(q->best_hash, k));
- }
- } else { // paired, tail
- k = kh_get(name, del_set, bam1_qname(b));
- if (k != kh_end(del_set)) {
- free((char*)kh_key(del_set, k));
- kh_del(name, del_set, k);
- } else samwrite(out, b);
- }
- last_pos = c->pos;
- }
-
- for (k = kh_begin(aux); k != kh_end(aux); ++k) {
- if (kh_exist(aux, k)) {
- lib_aux_t *q = &kh_val(aux, k);
- dump_best(&stack, out);
- fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
- (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
- kh_destroy(pos, q->best_hash);
- free((char*)kh_key(aux, k));
- }
- }
- kh_destroy(lib, aux);
-
- clear_del_set(del_set);
- kh_destroy(name, del_set);
- free(stack.a);
- bam_destroy1(b);
-}
-
-void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se);
-
-int bam_rmdup(int argc, char *argv[])
-{
- int c, is_se = 0, force_se = 0;
- samfile_t *in, *out;
- while ((c = getopt(argc, argv, "sS")) >= 0) {
- switch (c) {
- case 's': is_se = 1; break;
- case 'S': force_se = is_se = 1; break;
- }
- }
- if (optind + 2 > argc) {
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools rmdup [-sS] <input.srt.bam> <output.bam>\n\n");
- fprintf(stderr, "Option: -s rmdup for SE reads\n");
- fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n\n");
- return 1;
- }
- in = samopen(argv[optind], "rb", 0);
- out = samopen(argv[optind+1], "wb", in->header);
- if (in == 0 || out == 0) {
- fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
- return 1;
- }
- if (is_se) bam_rmdupse_core(in, out, force_se);
- else bam_rmdup_core(in, out);
- samclose(in); samclose(out);
- return 0;
-}
diff --git a/sam/bam_rmdupse.c b/sam/bam_rmdupse.c
deleted file mode 100644
index e7dbdc7..0000000
--- a/sam/bam_rmdupse.c
+++ /dev/null
@@ -1,159 +0,0 @@
-#include <math.h>
-#include "sam.h"
-#include "khash.h"
-#include "klist.h"
-
-#define QUEUE_CLEAR_SIZE 0x100000
-#define MAX_POS 0x7fffffff
-
-typedef struct {
- int endpos;
- uint32_t score:31, discarded:1;
- bam1_t *b;
-} elem_t, *elem_p;
-#define __free_elem(p) bam_destroy1((p)->data.b)
-KLIST_INIT(q, elem_t, __free_elem)
-typedef klist_t(q) queue_t;
-
-KHASH_MAP_INIT_INT(best, elem_p)
-typedef khash_t(best) besthash_t;
-
-typedef struct {
- uint64_t n_checked, n_removed;
- besthash_t *left, *rght;
-} lib_aux_t;
-KHASH_MAP_INIT_STR(lib, lib_aux_t)
-
-static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
-{
- khint_t k = kh_get(lib, aux, lib);
- if (k == kh_end(aux)) {
- int ret;
- char *p = strdup(lib);
- lib_aux_t *q;
- k = kh_put(lib, aux, p, &ret);
- q = &kh_val(aux, k);
- q->left = kh_init(best);
- q->rght = kh_init(best);
- q->n_checked = q->n_removed = 0;
- return q;
- } else return &kh_val(aux, k);
-}
-
-static inline int sum_qual(const bam1_t *b)
-{
- int i, q;
- uint8_t *qual = bam1_qual(b);
- for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
- return q;
-}
-
-static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score)
-{
- elem_t *p = kl_pushp(q, queue);
- p->discarded = 0;
- p->endpos = endpos; p->score = score;
- if (p->b == 0) p->b = bam_init1();
- bam_copy1(p->b, b);
- return p;
-}
-
-static void clear_besthash(besthash_t *h, int32_t pos)
-{
- khint_t k;
- for (k = kh_begin(h); k != kh_end(h); ++k)
- if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos)
- kh_del(best, h, k);
-}
-
-static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h)
-{
- if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) {
- khint_t k;
- while (1) {
- elem_t *q;
- if (queue->head == queue->tail) break;
- q = &kl_val(queue->head);
- if (q->discarded) {
- q->b->data_len = 0;
- kl_shift(q, queue, 0);
- continue;
- }
- if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break;
- samwrite(out, q->b);
- q->b->data_len = 0;
- kl_shift(q, queue, 0);
- }
- for (k = kh_begin(h); k != kh_end(h); ++k) {
- if (kh_exist(h, k)) {
- clear_besthash(kh_val(h, k).left, pos);
- clear_besthash(kh_val(h, k).rght, pos);
- }
- }
- }
-}
-
-void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
-{
- bam1_t *b;
- queue_t *queue;
- khint_t k;
- int last_tid = -2;
- khash_t(lib) *aux;
-
- aux = kh_init(lib);
- b = bam_init1();
- queue = kl_init(q);
- while (samread(in, b) >= 0) {
- bam1_core_t *c = &b->core;
- int endpos = bam_calend(c, bam1_cigar(b));
- int score = sum_qual(b);
-
- if (last_tid != c->tid) {
- if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux);
- last_tid = c->tid;
- } else dump_alignment(out, queue, c->pos, aux);
- if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
- push_queue(queue, b, endpos, score);
- } else {
- const char *lib;
- lib_aux_t *q;
- besthash_t *h;
- uint32_t key;
- int ret;
- lib = bam_get_library(in->header, b);
- q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
- ++q->n_checked;
- h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
- key = (c->flag&BAM_FREVERSE)? endpos : c->pos;
- k = kh_put(best, h, key, &ret);
- if (ret == 0) { // in the hash table
- elem_t *p = kh_val(h, k);
- ++q->n_removed;
- if (p->score < score) {
- if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue
- p->discarded = 1;
- kh_val(h, k) = push_queue(queue, b, endpos, score);
- } else { // replace
- p->score = score; p->endpos = endpos;
- bam_copy1(p->b, b);
- }
- } // otherwise, discard the alignment
- } else kh_val(h, k) = push_queue(queue, b, endpos, score);
- }
- }
- dump_alignment(out, queue, MAX_POS, aux);
-
- for (k = kh_begin(aux); k != kh_end(aux); ++k) {
- if (kh_exist(aux, k)) {
- lib_aux_t *q = &kh_val(aux, k);
- fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
- (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
- kh_destroy(best, q->left); kh_destroy(best, q->rght);
- free((char*)kh_key(aux, k));
- }
- }
- kh_destroy(lib, aux);
- bam_destroy1(b);
- kl_destroy(q, queue);
-}
diff --git a/sam/bam_sort.c b/sam/bam_sort.c
deleted file mode 100644
index c46bce3..0000000
--- a/sam/bam_sort.c
+++ /dev/null
@@ -1,571 +0,0 @@
-#include <stdlib.h>
-#include <ctype.h>
-#include <assert.h>
-#include <errno.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include "bam.h"
-#include "ksort.h"
-
-static int g_is_by_qname = 0;
-
-/* Compare two strings so that embedded runs of digits order numerically
- * ("read9" sorts before "read10").  Returns <0, 0 or >0 like strcmp(). */
-static int strnum_cmp(const char *_a, const char *_b)
-{
-	const unsigned char *const s = (const unsigned char*)_a;
-	const unsigned char *const t = (const unsigned char*)_b;
-	const unsigned char *p = s, *q = t;
-
-	while (*p != 0 && *q != 0) {
-		if (!isdigit(*p) || !isdigit(*q)) {
-			// ordinary characters: plain byte comparison
-			if (*p != *q) return (int)*p - (int)*q;
-			++p; ++q;
-			continue;
-		}
-		// both sides start a number: drop leading zeros, then the shared digits
-		while (*p == '0') ++p;
-		while (*q == '0') ++q;
-		while (isdigit(*p) && isdigit(*q) && *p == *q) { ++p; ++q; }
-		if (isdigit(*p) && isdigit(*q)) {
-			// first differing digit found; the longer number is the larger one
-			int n = 0;
-			while (isdigit(p[n]) && isdigit(q[n])) ++n;
-			if (isdigit(p[n])) return 1;
-			if (isdigit(q[n])) return -1;
-			return (int)*p - (int)*q;
-		}
-		if (isdigit(*p)) return 1;
-		if (isdigit(*q)) return -1;
-		// equal values: the side with more leading zeros has consumed more bytes
-		if (p - s != q - t) return (p - s < q - t)? 1 : -1;
-	}
-	if (*p) return 1;
-	return *q? -1 : 0;
-}
-
-#define HEAP_EMPTY 0xffffffffffffffffull
-
-typedef struct { // one merge-heap slot per input file
- int i; // index of the input file this slot reads from (used as fp[i], iter[i], fn[i])
- uint64_t pos, idx; // pos: packed tid/position/strand sort key, or HEAP_EMPTY; idx: running read counter used as a tie-breaker
- bam1_t *b; // current record of that input; set to 0 once the input is exhausted
-} heap1_t;
-
-#define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx))))
-
-/* Ordering predicate handed to KSORT_INIT for the merge heap.  It returns
- * non-zero when `a` sorts AFTER `b`, so the heap top is the smallest record.
- * A slot without a record (b == 0) always sorts last in query-name mode. */
-static inline int heap_lt(const heap1_t a, const heap1_t b)
-{
-	int cmp;
-
-	if (!g_is_by_qname) return __pos_cmp(a, b);
-	if (a.b == 0 || b.b == 0) return a.b == 0;
-	cmp = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b));
-	if (cmp != 0) return cmp > 0;
-	// same name: order by the two flag bits selected by 0xc0
-	return (a.b->core.flag&0xc0) > (b.b->core.flag&0xc0);
-}
-
-KSORT_INIT(heap, heap1_t, heap_lt)
-
-/* Exchange the binary reference dictionary (count, names, lengths) of two
- * headers; the text part of each header is left where it is. */
-static void swap_header_targets(bam_header_t *h1, bam_header_t *h2)
-{
-	bam_header_t saved;
-
-	saved.n_targets = h1->n_targets;
-	saved.target_name = h1->target_name;
-	saved.target_len = h1->target_len;
-
-	h1->n_targets = h2->n_targets;
-	h1->target_name = h2->target_name;
-	h1->target_len = h2->target_len;
-
-	h2->n_targets = saved.n_targets;
-	h2->target_name = saved.target_name;
-	h2->target_len = saved.target_len;
-}
-
-/* Exchange the plain-text part (pointer and length) of two headers. */
-static void swap_header_text(bam_header_t *h1, bam_header_t *h2)
-{
-	int len1 = h1->l_text;
-	char *text1 = h1->text;
-
-	h1->l_text = h2->l_text;
-	h1->text = h2->text;
-	h2->l_text = len1;
-	h2->text = text1;
-}
-
-#define MERGE_RG 1
-#define MERGE_UNCOMP 2
-#define MERGE_LEVEL1 4
-#define MERGE_FORCE 8
-
-/*!
-  @abstract    Merge multiple sorted BAM.
-  @param  by_qname   whether the inputs are sorted by query name
-  @param  out        output BAM file name, or "-" for stdout
-  @param  headers    name of SAM file from which to copy '@' header lines,
-                     or NULL to copy them from the first file to be merged
-  @param  n          number of files to be merged
-  @param  fn         names of files to be merged
-  @param  flag       bitwise OR of MERGE_RG, MERGE_UNCOMP and MERGE_LEVEL1
-                     (MERGE_FORCE is not examined here)
-  @param  reg        region to restrict the merge to, or NULL for everything;
-                     an index is loaded for every input when it is given
-  @param  n_threads  number of BGZF compression threads
-  @param  level      compression level 0..9, or negative for the default
-  @return 0 on success, -1 on failure
-
-  @discussion Padding information may NOT be correctly maintained. This
-  function is NOT thread safe.
- */
-int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads, int level)
-{
-	bamFile fpout = 0, *fp = 0;
-	heap1_t *heap = 0;
-	bam_header_t *hout = 0;
-	bam_header_t *hheaders = NULL;
-	int i, j, ret = -1, *RG_len = 0;
-	uint64_t idx = 0;
-	char **RG = 0, mode[8];
-	bam_iter_t *iter = 0;
-
-	if (n <= 0) return -1; // nothing to merge; heap[0] would not exist
-	if (headers) {
-		tamFile fpheaders = sam_open(headers);
-		if (fpheaders == 0) {
-			const char *message = strerror(errno);
-			fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
-			return -1;
-		}
-		hheaders = sam_header_read(fpheaders);
-		sam_close(fpheaders);
-	}
-
-	g_is_by_qname = by_qname;
-	fp = (bamFile*)calloc(n, sizeof(bamFile));
-	heap = (heap1_t*)calloc(n, sizeof(heap1_t));
-	iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t));
-	if (fp == 0 || heap == 0 || iter == 0) goto nomem;
-	// prepare RG tag: the file's base name without a trailing ".bam"
-	if (flag & MERGE_RG) {
-		RG = (char**)calloc(n, sizeof(char*));
-		RG_len = (int*)calloc(n, sizeof(int));
-		if (RG == 0 || RG_len == 0) goto nomem;
-		for (i = 0; i != n; ++i) {
-			int l = strlen(fn[i]);
-			const char *s = fn[i];
-			if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
-			for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
-			++j; l -= j;
-			RG[i] = (char*)calloc(l + 1, 1); // zero-filled, so the copy below stays NUL-terminated
-			if (RG[i] == 0) goto nomem;
-			RG_len[i] = l;
-			memcpy(RG[i], s + j, l);
-		}
-	}
-	// open every input and reconcile the reference dictionaries
-	for (i = 0; i != n; ++i) {
-		bam_header_t *hin;
-		fp[i] = bam_open(fn[i], "r");
-		if (fp[i] == 0) {
-			fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
-			goto cleanup;
-		}
-		hin = bam_header_read(fp[i]);
-		if (hin == 0) {
-			fprintf(stderr, "[bam_merge_core] fail to read the header of file %s\n", fn[i]);
-			goto cleanup;
-		}
-		if (i == 0) { // the first BAM
-			hout = hin;
-		} else { // validate against what has been collected so far
-			int min_n_targets = hout->n_targets;
-			if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets;
-
-			for (j = 0; j < min_n_targets; ++j)
-				if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) {
-					fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n",
-							hout->target_name[j], hin->target_name[j], fn[i]);
-					bam_header_destroy(hin);
-					goto cleanup;
-				}
-
-			// If this input file has additional target reference sequences,
-			// add them to the headers to be output
-			if (hin->n_targets > hout->n_targets) {
-				swap_header_targets(hout, hin);
-				// FIXME Possibly we should also create @SQ text headers
-				// for the newly added reference sequences
-			}
-
-			bam_header_destroy(hin);
-		}
-	}
-
-	if (hheaders) {
-		// If the text headers to be swapped in include any @SQ headers,
-		// check that they are consistent with the existing binary list
-		// of reference information.
-		if (hheaders->n_targets > 0) {
-			int n_cmp = hout->n_targets;
-			if (hout->n_targets != hheaders->n_targets) {
-				fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers);
-				if (!reg) goto cleanup;
-				// only compare names that exist in both lists
-				if (hheaders->n_targets < n_cmp) n_cmp = hheaders->n_targets;
-			}
-			for (j = 0; j < n_cmp; ++j)
-				if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) {
-					fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers);
-					if (!reg) goto cleanup;
-				}
-		}
-
-		swap_header_text(hout, hheaders);
-		bam_header_destroy(hheaders);
-		hheaders = 0;
-	}
-
-	if (reg) {
-		int tid, beg, end;
-		if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) {
-			fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__);
-			goto cleanup;
-		}
-		for (i = 0; i < n; ++i) {
-			bam_index_t *bidx = bam_index_load(fn[i]);
-			if (bidx == 0) {
-				fprintf(stderr, "[%s] fail to load the index of '%s'\n", __func__, fn[i]);
-				goto cleanup;
-			}
-			iter[i] = bam_iter_query(bidx, tid, beg, end);
-			bam_index_destroy(bidx);
-		}
-	}
-
-	// prime the heap with the first record of every input
-	for (i = 0; i < n; ++i) {
-		heap1_t *h = heap + i;
-		h->i = i;
-		h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
-		if (h->b == 0) goto nomem;
-		if (bam_iter_read(fp[i], iter[i], h->b) >= 0) {
-			h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b);
-			h->idx = idx++;
-		} else {
-			// no record at all: mark the slot empty the same way the main loop
-			// does, so heap_lt() never looks at a record without a name
-			h->pos = HEAP_EMPTY;
-			free(h->b->data); free(h->b);
-			h->b = 0;
-		}
-	}
-	if (flag & MERGE_UNCOMP) level = 0;
-	else if (flag & MERGE_LEVEL1) level = 1;
-	strcpy(mode, "w");
-	if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
-	// pass the computed mode so the requested compression level takes effect
-	fpout = strcmp(out, "-")? bam_open(out, mode) : bam_dopen(fileno(stdout), mode);
-	if (fpout == 0) {
-		fprintf(stderr, "[%s] fail to create the output file.\n", __func__);
-		goto cleanup;
-	}
-	bam_header_write(fpout, hout);
-	if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256);
-
-	ks_heapmake(heap, n, heap);
-	while (heap->pos != HEAP_EMPTY) {
-		bam1_t *b = heap->b;
-		if (flag & MERGE_RG) {
-			uint8_t *rg = bam_aux_get(b, "RG");
-			if (rg) bam_aux_del(b, rg);
-			bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
-		}
-		bam_write1_core(fpout, &b->core, b->data_len, b->data);
-		j = bam_iter_read(fp[heap->i], iter[heap->i], b);
-		if (j >= 0) {
-			heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b);
-			heap->idx = idx++;
-		} else {
-			// Both a clean end of file (-1) and a truncated file retire this
-			// input; leaving a truncated input on the heap would write its
-			// last record again and again.
-			if (j != -1) fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
-			heap->pos = HEAP_EMPTY;
-			free(heap->b->data); free(heap->b);
-			heap->b = 0;
-		}
-		ks_heapadjust(heap, 0, n, heap);
-	}
-	ret = 0;
-	goto cleanup;
-
-nomem:
-	fprintf(stderr, "[bam_merge_core] out of memory\n");
-cleanup:
-	if (RG) {
-		for (i = 0; i != n; ++i) free(RG[i]);
-		free(RG);
-	}
-	free(RG_len);
-	for (i = 0; i != n; ++i) {
-		if (heap && heap[i].b) { free(heap[i].b->data); free(heap[i].b); }
-		if (iter) bam_iter_destroy(iter[i]);
-		if (fp && fp[i]) bam_close(fp[i]);
-	}
-	if (fpout) bam_close(fpout);
-	if (hout) bam_header_destroy(hout);
-	if (hheaders) bam_header_destroy(hheaders);
-	free(fp); free(heap); free(iter);
-	return ret;
-}
-
-/* Older entry point kept for callers that do not pass threading or
- * compression options: no extra threads, default compression level. */
-int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg)
-{
-	const int n_threads = 0;
-	const int level = -1;
-
-	return bam_merge_core2(by_qname, out, headers, n, fn, flag, reg, n_threads, level);
-}
-
-/* Command-line front end of "samtools merge": parses the options, refuses to
- * overwrite an existing output unless -f is given, then calls
- * bam_merge_core2().  Returns 0 on success and 1 on any failure. */
-int bam_merge(int argc, char *argv[])
-{
-	int c, is_by_qname = 0, flag = 0, ret = 0, n_threads = 0, level = -1;
-	// These point into argv (via optarg), which outlives this function, so no
-	// copy is needed and nothing has to be freed on the early returns below.
-	const char *fn_headers = NULL, *reg = NULL;
-
-	while ((c = getopt(argc, argv, "h:nru1R:f@:l:")) >= 0) {
-		switch (c) {
-		case 'r': flag |= MERGE_RG; break;
-		case 'f': flag |= MERGE_FORCE; break;
-		case 'h': fn_headers = optarg; break;
-		case 'n': is_by_qname = 1; break;
-		case '1': flag |= MERGE_LEVEL1; break;
-		case 'u': flag |= MERGE_UNCOMP; break;
-		case 'R': reg = optarg; break;
-		case 'l': level = atoi(optarg); break;
-		case '@': n_threads = atoi(optarg); break;
-		default: break; // getopt() has already reported the unknown option
-		}
-	}
-	if (optind + 2 >= argc) { // need the output plus at least two inputs
-		fprintf(stderr, "\n");
-		fprintf(stderr, "Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]\n\n");
-		fprintf(stderr, "Options: -n sort by read names\n");
-		fprintf(stderr, " -r attach RG tag (inferred from file names)\n");
-		fprintf(stderr, " -u uncompressed BAM output\n");
-		fprintf(stderr, " -f overwrite the output BAM if exist\n");
-		fprintf(stderr, " -1 compress level 1\n");
-		fprintf(stderr, " -l INT compression level, from 0 to 9 [-1]\n");
-		fprintf(stderr, " -@ INT number of BAM compression threads [0]\n");
-		fprintf(stderr, " -R STR merge file in the specified region STR [all]\n");
-		fprintf(stderr, " -h FILE copy the header in FILE to <out.bam> [in1.bam]\n\n");
-		fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n");
-		fprintf(stderr, " must provide the correct header with -h, or uses Picard which properly maintains\n");
-		fprintf(stderr, " the header dictionary in merging.\n\n");
-		return 1;
-	}
-	if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) {
-		FILE *fp = fopen(argv[optind], "rb");
-		if (fp != NULL) {
-			fclose(fp);
-			fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]);
-			return 1;
-		}
-	}
-	if (bam_merge_core2(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg, n_threads, level) < 0) ret = 1;
-	return ret;
-}
-
-/***************
- * BAM sorting *
- ***************/
-
-#include <pthread.h>
-
-typedef bam1_t *bam1_p;
-
-/* Set the sort-order field of the header text to `so`.
- *
- * If the text starts with an @HD line, its "SO:" field is replaced (or added
- * at the end of that line when missing); otherwise a new
- * "@HD\tVN:1.3\tSO:<so>" line is put in front of the text.  h->text is
- * replaced by a newly allocated string and h->l_text is updated.
- *
- * Returns 0 on success (including "already set to `so`"), -1 when the @HD
- * line has no terminating newline or memory cannot be allocated; the header
- * is left untouched in that case. */
-static int change_SO(bam_header_t *h, const char *so)
-{
-	const size_t so_len = strlen(so);
-	// an empty header may carry no text buffer at all
-	const size_t old_len = h->text? (size_t)h->l_text : 0;
-	char *beg = 0, *end = 0, *newtext;
-	size_t new_len;
-
-	if (h->text && h->l_text > 3 && strncmp(h->text, "@HD", 3) == 0) {
-		char *eol, *q;
-		if ((eol = strchr(h->text, '\n')) == 0) return -1;
-		*eol = '\0'; // limit the search to the @HD line
-		q = strstr(h->text, "\tSO:");
-		*eol = '\n'; // change back
-		if (q != 0) {
-			beg = q;
-			for (end = q + 4; *end != '\n' && *end != '\t'; ++end);
-			// compare the whole field value, not just a common prefix
-			if ((size_t)(end - (q + 4)) == so_len && strncmp(q + 4, so, so_len) == 0)
-				return 0; // no need to change
-		} else beg = end = eol;
-	}
-	if (beg == 0) { // no @HD
-		int n_written;
-		new_len = old_len + so_len + 15; // 15 == strlen("@HD\tVN:1.3\tSO:") + strlen("\n")
-		newtext = (char*)malloc(new_len + 1);
-		if (newtext == 0) return -1;
-		n_written = sprintf(newtext, "@HD\tVN:1.3\tSO:%s\n", so);
-		if (old_len) memcpy(newtext + n_written, h->text, old_len);
-	} else { // has @HD but different or no SO
-		const size_t head = beg - h->text;
-		const size_t tail = (h->text + h->l_text) - end;
-		new_len = head + 4 + so_len + tail;
-		newtext = (char*)malloc(new_len + 1);
-		if (newtext == 0) return -1;
-		memcpy(newtext, h->text, head);
-		memcpy(newtext + head, "\tSO:", 4);
-		memcpy(newtext + head + 4, so, so_len);
-		memcpy(newtext + head + 4 + so_len, end, tail);
-	}
-	newtext[new_len] = '\0';
-	free(h->text);
-	h->text = newtext;
-	h->l_text = new_len;
-	return 0;
-}
-
-/* Strict "less than" used by the in-memory sort: by query name (then by the
- * flag bits selected by 0xc0) in query-name mode, otherwise by a key packed
- * from reference id, position and strand. */
-static inline int bam1_lt(const bam1_p a, const bam1_p b)
-{
-	if (!g_is_by_qname) {
-		uint64_t key_a = (uint64_t)a->core.tid<<32|(a->core.pos+1)<<1|bam1_strand(a);
-		uint64_t key_b = (uint64_t)b->core.tid<<32|(b->core.pos+1)<<1|bam1_strand(b);
-		return key_a < key_b;
-	} else {
-		int cmp = strnum_cmp(bam1_qname(a), bam1_qname(b));
-		if (cmp != 0) return cmp < 0;
-		return (a->core.flag&0xc0) < (b->core.flag&0xc0);
-	}
-}
-KSORT_INIT(sort, bam1_p, bam1_lt)
-
-typedef struct { // work item handed to one sorting thread by sort_blocks()
- size_t buf_len; // number of records in buf
- const char *prefix; // prefix of the temporary file name
- bam1_p *buf; // this thread's slice of the record pointer array
- const bam_header_t *h; // header written at the top of the temporary file
- int index; // number that worker() formats into the temporary file name
-} worker_t;
-
-/* Write the header `h` followed by the first `l` records of `buf` to `fn`
- * ("-" selects stdout), opened with `mode`.  Extra compression threads are
- * only attached when n_threads > 1.  Returns silently if the file cannot be
- * opened. */
-static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_header_t *h, int n_threads)
-{
-	bamFile out;
-	size_t r;
-
-	if (strcmp(fn, "-") != 0) out = bam_open(fn, mode);
-	else out = bam_dopen(fileno(stdout), mode);
-	if (out == 0) return;
-
-	bam_header_write(out, h);
-	if (n_threads > 1) bgzf_mt(out, n_threads, 256);
-	for (r = 0; r != l; ++r) {
-		bam1_p rec = buf[r];
-		bam_write1_core(out, &rec->core, rec->data_len, rec->data);
-	}
-	bam_close(out);
-}
-
-/* Thread body: sort this worker's slice in memory and write it to the
- * temporary file "<prefix>.<index>.bam" at compression level 1. */
-static void *worker(void *data)
-{
-	worker_t *job = (worker_t*)data;
-	size_t name_cap = strlen(job->prefix) + 20; // room for ".NNNN.bam" and the terminator
-	char *path;
-
-	ks_mergesort(sort, job->buf_len, job->buf, 0);
-	path = (char*)calloc(name_cap, 1);
-	sprintf(path, "%s.%.4d.bam", job->prefix, job->index);
-	write_buffer(path, "w1", job->buf_len, job->buf, job->h, 0);
-	free(path);
-	return 0;
-}
-
-/* Split the first `k` records of `buf` into n_threads contiguous slices, sort
- * each slice in its own thread and write it to a temporary file numbered from
- * `n_files` upwards.  Returns the new number of temporary files, or `n_files`
- * unchanged when the bookkeeping arrays cannot be allocated. */
-static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, const bam_header_t *h, int n_threads)
-{
-	int i;
-	size_t rest;
-	bam1_p *b;
-	pthread_t *tid;
-	pthread_attr_t attr;
-	worker_t *w;
-
-	if (n_threads < 1) n_threads = 1;
-	if (k < (size_t)n_threads * 64) n_threads = 1; // use a single thread if we only sort a small batch of records
-	w = (worker_t*)calloc(n_threads, sizeof(worker_t));
-	tid = (pthread_t*)calloc(n_threads, sizeof(pthread_t));
-	if (w == 0 || tid == 0) {
-		fprintf(stderr, "[sort_blocks] out of memory\n");
-		free(tid); free(w);
-		return n_files;
-	}
-	pthread_attr_init(&attr);
-	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-	b = buf; rest = k;
-	for (i = 0; i < n_threads; ++i) {
-		w[i].buf_len = rest / (n_threads - i); // spread what is left evenly over the remaining threads
-		w[i].buf = b;
-		w[i].prefix = prefix;
-		w[i].h = h;
-		w[i].index = n_files + i;
-		b += w[i].buf_len; rest -= w[i].buf_len;
-		pthread_create(&tid[i], &attr, worker, &w[i]);
-	}
-	for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
-	pthread_attr_destroy(&attr); // release whatever pthread_attr_init() set up
-	free(tid); free(w);
-	return n_files + n_threads;
-}
-
-/*!
-  @abstract Sort an unsorted BAM file based on the chromosome order
-  and the leftmost position of an alignment
-
-  @param  is_by_qname whether to sort by query name
-  @param  fn       name of the file to be sorted ("-" for stdin)
-  @param  prefix   prefix of the output and the temporary files; upon
-                   success, prefix.bam will be written.
-  @param  _max_mem approximate maximum memory per thread (very inaccurate)
-  @param  is_stdout write the final output to stdout instead of a file
-  @param  n_threads number of sorting and compression threads
-  @param  level    compression level 0..9 of the final output, or negative
-                   for the default
-  @param  full_path the given output path is the full path and not just the prefix
-
-  @discussion It may create multiple temporary subalignment files
-  and then merge them by calling bam_merge_core2(). This function is
-  NOT thread safe.
- */
-void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t _max_mem, int is_stdout, int n_threads, int level, int full_path)
-{
-	int ret = -1, i, n_files = 0;
-	size_t mem, max_k, k, max_mem;
-	bam_header_t *header;
-	bamFile fp;
-	bam1_t *b, **buf;
-	char *fnout = 0;
-	char const *suffix = ".bam";
-	if (full_path) suffix += 4; // points at the terminator: no suffix is appended
-
-	if (n_threads < 2) n_threads = 1;
-	g_is_by_qname = is_by_qname;
-	max_k = k = 0; mem = 0;
-	max_mem = _max_mem * n_threads;
-	buf = 0;
-	fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
-	if (fp == 0) {
-		fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn);
-		return;
-	}
-	header = bam_header_read(fp);
-	if (is_by_qname) change_SO(header, "queryname");
-	else change_SO(header, "coordinate");
-	// write sub files
-	for (;;) {
-		if (k == max_k) {
-			size_t old_max = max_k;
-			size_t new_max = max_k? max_k<<1 : 0x10000;
-			bam1_t **grown = (bam1_t**)realloc(buf, new_max * sizeof(void*));
-			if (grown == 0) { // buf is still valid and still holds max_k slots
-				fprintf(stderr, "[bam_sort_core] out of memory\n");
-				goto cleanup;
-			}
-			buf = grown; max_k = new_max;
-			memset(buf + old_max, 0, sizeof(void*) * (max_k - old_max));
-		}
-		if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t));
-		b = buf[k];
-		if (b == 0) {
-			fprintf(stderr, "[bam_sort_core] out of memory\n");
-			goto cleanup;
-		}
-		if ((ret = bam_read1(fp, b)) < 0) break;
-		if (b->data_len < b->m_data>>2) { // shrink
-			b->m_data = b->data_len;
-			kroundup32(b->m_data);
-			b->data = realloc(b->data, b->m_data);
-		}
-		mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays
-		++k;
-		if (mem >= max_mem) {
-			n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
-			mem = k = 0;
-		}
-	}
-	if (ret != -1)
-		fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n");
-	// output file name
-	fnout = calloc(strlen(prefix) + 20, 1);
-	if (is_stdout) sprintf(fnout, "-");
-	else sprintf(fnout, "%s%s", prefix, suffix);
-	// write the final output
-	if (n_files == 0) { // a single block
-		char mode[8];
-		strcpy(mode, "w");
-		if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
-		ks_mergesort(sort, k, buf, 0);
-		write_buffer(fnout, mode, k, buf, header, n_threads);
-	} else { // then merge
-		char **fns;
-		n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
-		fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files);
-		fns = (char**)calloc(n_files, sizeof(char*));
-		for (i = 0; i < n_files; ++i) {
-			fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
-			// worker() always names its temporary files "<prefix>.NNNN.bam",
-			// whether or not full_path strips the suffix of the final output
-			sprintf(fns[i], "%s.%.4d.bam", prefix, i);
-		}
-		bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, n_threads, level);
-		for (i = 0; i < n_files; ++i) {
-			unlink(fns[i]);
-			free(fns[i]);
-		}
-		free(fns);
-	}
-cleanup:
-	free(fnout);
-	// free
-	for (k = 0; k < max_k; ++k) {
-		if (!buf[k]) continue;
-		free(buf[k]->data);
-		free(buf[k]);
-	}
-	free(buf);
-	bam_header_destroy(header);
-	bam_close(fp);
-}
-
-/* Convenience wrapper around bam_sort_core_ext(): output goes to
- * "<prefix>.bam", no extra threads, default compression level. */
-void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem)
-{
-	const int is_stdout = 0, n_threads = 0, level = -1, full_path = 0;
-
-	bam_sort_core_ext(is_by_qname, fn, prefix, max_mem, is_stdout, n_threads, level, full_path);
-}
-
-/* Command-line front end of "samtools sort": parses the options and calls
- * bam_sort_core_ext().  Returns 1 on a usage error, 0 otherwise. */
-int bam_sort(int argc, char *argv[])
-{
-	size_t max_mem = 768<<20; // 768MB per thread, as stated in the usage text
-	int c, is_by_qname = 0, is_stdout = 0, n_threads = 0, level = -1, full_path = 0;
-	while ((c = getopt(argc, argv, "fnom:@:l:")) >= 0) {
-		switch (c) {
-		case 'f': full_path = 1; break;
-		case 'o': is_stdout = 1; break;
-		case 'n': is_by_qname = 1; break;
-		case 'm': {
-				char *q;
-				long v = strtol(optarg, &q, 0);
-				// a zero or negative size would wrap when stored in a size_t
-				if (v <= 0) {
-					fprintf(stderr, "[bam_sort] invalid memory size '%s'\n", optarg);
-					return 1;
-				}
-				max_mem = (size_t)v;
-				if (*q == 'k' || *q == 'K') max_mem <<= 10;
-				else if (*q == 'm' || *q == 'M') max_mem <<= 20;
-				else if (*q == 'g' || *q == 'G') max_mem <<= 30;
-				break;
-			}
-		case '@': n_threads = atoi(optarg); break;
-		case 'l': level = atoi(optarg); break;
-		default: break; // getopt() has already reported the unknown option
-		}
-	}
-	if (optind + 2 > argc) {
-		fprintf(stderr, "\n");
-		fprintf(stderr, "Usage: samtools sort [options] <in.bam> <out.prefix>\n\n");
-		fprintf(stderr, "Options: -n sort by read name\n");
-		fprintf(stderr, " -f use <out.prefix> as full file name instead of prefix\n");
-		fprintf(stderr, " -o final output to stdout\n");
-		fprintf(stderr, " -l INT compression level, from 0 to 9 [-1]\n");
-		fprintf(stderr, " -@ INT number of sorting and compression threads [1]\n");
-		fprintf(stderr, " -m INT max memory per thread; suffix K/M/G recognized [768M]\n");
-		fprintf(stderr, "\n");
-		return 1;
-	}
-	bam_sort_core_ext(is_by_qname, argv[optind], argv[optind+1], max_mem, is_stdout, n_threads, level, full_path);
-	return 0;
-}
diff --git a/sam/bam_stat.c b/sam/bam_stat.c
deleted file mode 100644
index f2de0f1..0000000
--- a/sam/bam_stat.c
+++ /dev/null
@@ -1,77 +0,0 @@
-#include <unistd.h>
-#include <assert.h>
-#include "bam.h"
-
-typedef struct { // flagstat counters; in every pair, [0] counts reads without BAM_FQCFAIL and [1] reads with it
- long long n_reads[2], n_mapped[2], n_pair_all[2], n_pair_map[2], n_pair_good[2]; // all reads / mapped / paired / both mates mapped / properly paired
- long long n_sgltn[2], n_read1[2], n_read2[2]; // mapped read with unmapped mate / BAM_FREAD1 / BAM_FREAD2
- long long n_dup[2]; // BAM_FDUP set
- long long n_diffchr[2], n_diffhigh[2]; // mate on a different reference / the same with mapping quality >= 5
-} bam_flagstat_t;
-
-#define flagstat_loop(s, c) do { \
- int w = ((c)->flag & BAM_FQCFAIL)? 1 : 0; \
- ++(s)->n_reads[w]; \
- if ((c)->flag & BAM_FPAIRED) { \
- ++(s)->n_pair_all[w]; \
- if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good[w]; \
- if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w]; \
- if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w]; \
- if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w]; \
- if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \
- ++(s)->n_pair_map[w]; \
- if ((c)->mtid != (c)->tid) { \
- ++(s)->n_diffchr[w]; \
- if ((c)->qual >= 5) ++(s)->n_diffhigh[w]; \
- } \
- } \
- } \
- if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w]; \
- if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \
- } while (0)
-
-/* Read every record from `fp` and tally its flags.  Returns a newly
- * allocated bam_flagstat_t that the caller must free(); a warning is printed
- * when reading stops for any reason other than a clean end of file. */
-bam_flagstat_t *bam_flagstat_core(bamFile fp)
-{
-	bam_flagstat_t *stats = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t));
-	bam1_t *rec = bam_init1();
-	bam1_core_t *core = &rec->core;
-	int status;
-
-	for (;;) {
-		status = bam_read1(fp, rec);
-		if (status < 0) break;
-		flagstat_loop(stats, core);
-	}
-	bam_destroy1(rec);
-	if (status != -1)
-		fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n");
-	return stats;
-}
-/* Command-line front end of "samtools flagstat": prints the flag statistics
- * of one BAM file ("-" for stdin) to stdout.  Returns 0 on success and 1 on
- * a usage error or when the input cannot be opened.  Percentages are computed
- * in floating point, so a zero denominator prints as nan or inf. */
-int bam_flagstat(int argc, char *argv[])
-{
-	bamFile fp;
-	bam_header_t *header;
-	bam_flagstat_t *s;
-	if (argc == optind) {
-		fprintf(stderr, "Usage: samtools flagstat <in.bam>\n");
-		return 1;
-	}
-	fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
-	if (fp == 0) { // report the failure even when assertions are compiled out
-		fprintf(stderr, "[bam_flagstat] fail to open file %s\n", argv[optind]);
-		return 1;
-	}
-	header = bam_header_read(fp);
-	s = bam_flagstat_core(fp);
-	printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
-	printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]);
-	printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0);
-	printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
-	printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]);
-	printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]);
-	printf("%lld + %lld properly paired (%.2f%%:%.2f%%)\n", s->n_pair_good[0], s->n_pair_good[1], (float)s->n_pair_good[0] / s->n_pair_all[0] * 100.0, (float)s->n_pair_good[1] / s->n_pair_all[1] * 100.0);
-	printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]);
-	printf("%lld + %lld singletons (%.2f%%:%.2f%%)\n", s->n_sgltn[0], s->n_sgltn[1], (float)s->n_sgltn[0] / s->n_pair_all[0] * 100.0, (float)s->n_sgltn[1] / s->n_pair_all[1] * 100.0);
-	printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]);
-	printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]);
-	free(s);
-	bam_header_destroy(header);
-	bam_close(fp);
-	return 0;
-}
diff --git a/sam/bam_tview.c b/sam/bam_tview.c
deleted file mode 100644
index 06d5e33..0000000
--- a/sam/bam_tview.c
+++ /dev/null
@@ -1,368 +0,0 @@
-#include <assert.h>
-#include "bam_tview.h"
-
-/* Shared initialisation of a text viewer: opens the BAM file `fn` and its
- * index, optionally loads the reference FASTA index `fn_fa`, and, when
- * `samples` is given, collects the read groups whose ID or SM equals it into
- * tv->rg_hash.  Any failure prints a message and terminates the process;
- * otherwise 0 is returned. */
-int base_tv_init(tview_t* tv,const char *fn, const char *fn_fa, const char *samples)
-	{
-	assert(tv!=NULL);
-	assert(fn!=NULL);
-	tv->mrow = 24; tv->mcol = 80;
-	tv->color_for = TV_COLOR_MAPQ;
-	tv->is_dot = 1;
-
-	tv->fp = bam_open(fn, "r");
-	if(tv->fp==0)
-		{
-		// fn_fa may be NULL here, so it must not be handed to "%s"
-		fprintf(stderr,"Cannot open '%s'.\n", fn);
-		exit(EXIT_FAILURE);
-		}
-	bgzf_set_cache_size(tv->fp, 8 * 1024 *1024);
-
-	tv->header = bam_header_read(tv->fp);
-	if(tv->header==0)
-		{
-		fprintf(stderr,"Cannot read '%s'.\n", fn);
-		exit(EXIT_FAILURE);
-		}
-	tv->idx = bam_index_load(fn);
-	if (tv->idx == 0)
-		{
-		fprintf(stderr,"Cannot read index for '%s'.\n", fn);
-		exit(EXIT_FAILURE);
-		}
-	tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv);
-	if (fn_fa) tv->fai = fai_load(fn_fa);
-	tv->bca = bcf_call_init(0.83, 13);
-	tv->ins = 1;
-
-	if ( samples )
-		{
-		if ( !tv->header->dict ) tv->header->dict = sam_header_parse2(tv->header->text);
-		void *iter = tv->header->dict;
-		const char *key, *val;
-		int n = 0;
-		tv->rg_hash = kh_init(kh_rg);
-		// walk the @RG lines: key is the ID field, val the SM field
-		while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
-			{
-			if ( !strcmp(samples,key) || (val && !strcmp(samples,val)) )
-				{
-				khiter_t k = kh_get(kh_rg, tv->rg_hash, key);
-				if ( k != kh_end(tv->rg_hash) ) continue; // this read group is already recorded
-				int ret;
-				k = kh_put(kh_rg, tv->rg_hash, key, &ret);
-				kh_value(tv->rg_hash, k) = val;
-				n++;
-				}
-			}
-		if ( !n )
-			{
-			fprintf(stderr,"The sample or read group \"%s\" not present.\n", samples);
-			exit(EXIT_FAILURE);
-			}
-		}
-
-	return 0;
-	}
-
-
-/* Release everything base_tv_init() and base_draw_aln() attached to `tv`.
- * The tview_t object itself is not freed here. */
-void base_tv_destroy(tview_t* tv)
-	{
-	bam_lplbuf_destroy(tv->lplbuf);
-	bcf_call_destroy(tv->bca);
-	bam_index_destroy(tv->idx);
-	if (tv->fai) fai_destroy(tv->fai);
-	free(tv->ref);
-	// The table only stores pointers handed out by the header dictionary, so
-	// it is enough to drop the table itself; do it before the header goes away.
-	if (tv->rg_hash) kh_destroy(kh_rg, tv->rg_hash);
-	bam_header_destroy(tv->header);
-	bam_close(tv->fp);
-	}
-
-
-int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) // pileup callback (registered through bam_lplbuf_init in base_tv_init): draws the screen column(s) for reference position pos; always returns 0
-{
- extern unsigned char bam_nt16_table[256];
- tview_t *tv = (tview_t*)data;
- int i, j, c, rb, attr, max_ins = 0;
- uint32_t call = 0;
- if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen
- // print reference
- rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N';
- for (i = tv->last_pos + 1; i < pos; ++i) { // fill the columns between the last drawn position and pos
- if (i%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", i+1);
- c = tv->ref? tv->ref[i - tv->left_pos] : 'N';
- tv->my_mvaddch(tv,1, tv->ccol++, c);
- }
- if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1);
- { // call consensus
- bcf_callret1_t bcr;
- int qsum[4], a1, a2, tmp;
- double p[3], prior = 30;
- bcf_call_glfgen(n, pl, bam_nt16_table[rb], tv->bca, &bcr);
- for (i = 0; i < 4; ++i) qsum[i] = bcr.qsum[i]<<2 | i; // keep the base index in the low two bits so it survives the sort
- for (i = 1; i < 4; ++i) // insertion sort
- for (j = i; j > 0 && qsum[j] > qsum[j-1]; --j)
- tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp;
- a1 = qsum[0]&3; a2 = qsum[1]&3; // the two bases with the largest qsum
- p[0] = bcr.p[a1*5+a1]; p[1] = bcr.p[a1*5+a2] + prior; p[2] = bcr.p[a2*5+a2];
- if ("ACGT"[a1] != toupper(rb)) p[0] += prior + 3;
- if ("ACGT"[a2] != toupper(rb)) p[2] += prior + 3;
- if (p[0] < p[1] && p[0] < p[2]) call = (1<<a1)<<16 | (int)((p[1]<p[2]?p[1]:p[2]) - p[0] + .499);
- else if (p[2] < p[1] && p[2] < p[0]) call = (1<<a2)<<16 | (int)((p[0]<p[1]?p[0]:p[1]) - p[2] + .499);
- else call = (1<<a1|1<<a2)<<16 | (int)((p[0]<p[2]?p[0]:p[2]) - p[1] + .499);
- } // call now holds a base bitmask in bits 16..19 and the gap to the runner-up in the low 16 bits
- attr = tv->my_underline(tv);
- c = ",ACMGRSVTWYHKDBN"[call>>16&0xf]; // bitmask -> one letter (presumably IUPAC ambiguity codes; confirm)
- i = (call&0xffff)/10+1; // colour pair chosen from the low 16 bits, capped at 4 below
- if (i > 4) i = 4;
- attr |= tv->my_colorpair(tv,i);
- if (c == toupper(rb)) c = '.';
- tv->my_attron(tv,attr);
- tv->my_mvaddch(tv,2, tv->ccol, c);
- tv->my_attroff(tv,attr);
- if(tv->ins) {
- // calculate maximum insert
- for (i = 0; i < n; ++i) {
- const bam_pileup1_t *p = pl + i;
- if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel;
- }
- }
- // core loop
- for (j = 0; j <= max_ins; ++j) { // j == 0 is the aligned column, j > 0 the extra columns for inserted bases
- for (i = 0; i < n; ++i) {
- const bam_pileup1_t *p = pl + i;
- int row = TV_MIN_ALNROW + p->level - tv->row_shift;
- if (j == 0) {
- if (!p->is_del) {
- if (tv->base_for == TV_BASE_COLOR_SPACE &&
- (c = bam_aux_getCSi(p->b, p->qpos))) {
- // assume that if we found one color, we will be able to get the color error
- if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam1_strand(p->b)? ',' : '.';
- } else {
- if (tv->show_name) {
- char *name = bam1_qname(p->b);
- c = (p->qpos + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos];
- } else {
- c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
- if (tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
- }
- }
- } else c = p->is_refskip? (bam1_strand(p->b)? '<' : '>') : '*';
- } else { // padding
- if (j > p->indel) c = '*';
- else { // insertion
- if (tv->base_for == TV_BASE_NUCL) {
- if (tv->show_name) {
- char *name = bam1_qname(p->b);
- c = (p->qpos + j + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos + j];
- } else {
- c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
- if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
- }
- } else {
- c = bam_aux_getCSi(p->b, p->qpos + j);
- if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam1_strand(p->b)? ',' : '.';
- }
- }
- }
- if (row > TV_MIN_ALNROW && row < tv->mrow) { // only draw rows that fall inside the alignment area
- int x;
- attr = 0;
- if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR))
- || (p->b->core.flag & BAM_FSECONDARY)) attr |= tv->my_underline(tv);
- if (tv->color_for == TV_COLOR_BASEQ) {
- x = bam1_qual(p->b)[p->qpos]/10 + 1;
- if (x > 4) x = 4;
- attr |= tv->my_colorpair(tv,x);
- } else if (tv->color_for == TV_COLOR_MAPQ) {
- x = p->b->core.qual/10 + 1;
- if (x > 4) x = 4;
- attr |= tv->my_colorpair(tv,x);
- } else if (tv->color_for == TV_COLOR_NUCL) {
- x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)] + 5;
- attr |= tv->my_colorpair(tv,x);
- } else if(tv->color_for == TV_COLOR_COL) {
- x = 0;
- switch(bam_aux_getCSi(p->b, p->qpos)) {
- case '0': x = 0; break;
- case '1': x = 1; break;
- case '2': x = 2; break;
- case '3': x = 3; break;
- case '4': x = 4; break;
- default: x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; break;
- }
- x+=5;
- attr |= tv->my_colorpair(tv,x);
- } else if(tv->color_for == TV_COLOR_COLQ) {
- x = bam_aux_getCQi(p->b, p->qpos);
- if(0 == x) x = bam1_qual(p->b)[p->qpos];
- x = x/10 + 1;
- if (x > 4) x = 4;
- attr |= tv->my_colorpair(tv,x);
- }
- tv->my_attron(tv,attr);
- tv->my_mvaddch(tv,row, tv->ccol, bam1_strand(p->b)? tolower(c) : toupper(c)); // reverse-strand reads are drawn in lower case
- tv->my_attroff(tv,attr);
- }
- }
- c = j? '*' : rb; // reference row: the base itself, or '*' above an insertion column
- if (c == '*') {
- attr = tv->my_colorpair(tv,8);
- tv->my_attron(tv,attr);
- tv->my_mvaddch(tv,1, tv->ccol++, c);
- tv->my_attroff(tv,attr);
- } else tv->my_mvaddch(tv,1, tv->ccol++, c);
- }
- tv->last_pos = pos;
- return 0;
-}
-
-
-
-
-/* bam_fetch() callback: pushes one alignment into the viewer's pileup
- * buffer.  With a read-group filter active, reads lacking an RG tag or
- * carrying an unselected one are dropped.  With no_skip set, reference-skip
- * CIGAR operations are rewritten in place as deletions.  Always returns 0. */
-int tv_fetch_func(const bam1_t *b, void *data)
-{
-	tview_t *view = (tview_t*)data;
-
-	if ( view->rg_hash )
-	{
-		const uint8_t *rg_tag = bam_aux_get(b, "RG");
-		khiter_t hit;
-		if ( !rg_tag ) return 0;
-		hit = kh_get(kh_rg, view->rg_hash, (const char*)(rg_tag + 1));
-		if ( hit == kh_end(view->rg_hash) ) return 0;
-	}
-	if (view->no_skip) {
-		uint32_t *op = bam1_cigar(b); // this is cheating...
-		int left;
-		for (left = b->core.n_cigar; left > 0; --left, ++op) {
-			if ((*op&0xf) == BAM_CREF_SKIP)
-				*op = *op>>4<<4 | BAM_CDEL;
-		}
-	}
-	bam_lplbuf_push(b, view->lplbuf);
-	return 0;
-}
-
-/* Redraw the whole view starting at reference `tid`, 0-based position `pos`:
- * clears the screen, refetches the reference slice when a FASTA index is
- * loaded, runs the alignments overlapping the window through the pileup
- * buffer, and finally pads the ruler and reference rows up to the right
- * edge.  Always returns 0. */
-int base_draw_aln(tview_t *tv, int tid, int pos)
-	{
-	assert(tv!=NULL);
-	// reset
-	tv->my_clear(tv);
-	tv->curr_tid = tid; tv->left_pos = pos;
-	tv->last_pos = tv->left_pos - 1;
-	tv->ccol = 0;
-	// print ref and consensus
-	if (tv->fai) {
-		char *str;
-		free(tv->ref);
-		assert(tv->curr_tid>=0);
-
-		str = (char*)calloc(strlen(tv->header->target_name[tv->curr_tid]) + 30, 1);
-		assert(str!=NULL);
-		sprintf(str, "%s:%d-%d", tv->header->target_name[tv->curr_tid], tv->left_pos + 1, tv->left_pos + tv->mcol);
-		tv->ref = fai_fetch(tv->fai, str, &tv->l_ref);
-		free(str);
-	}
-	// draw aln
-	bam_lplbuf_reset(tv->lplbuf);
-	bam_fetch(tv->fp, tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol, tv, tv_fetch_func);
-	bam_lplbuf_push(0, tv->lplbuf);
-
-	// pad the remaining columns with reference bases (or 'N')
-	while (tv->ccol < tv->mcol) {
-		int pos = tv->last_pos + 1;
-		int off = pos - tv->left_pos; // tv->ref starts at left_pos, so bounds are checked on the offset
-		if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1);
-		tv->my_mvaddch(tv,1, tv->ccol++, (tv->ref && off >= 0 && off < tv->l_ref)? tv->ref[off] : 'N');
-		++tv->last_pos;
-	}
-	return 0;
-}
-
-
-
-
-/* Print either the usage text (when `format` is NULL) or a printf-style
- * message to stderr, then terminate the process with exit(-1). */
-static void error(const char *format, ...)
-{
-	if ( format )
-	{
-		va_list args;
-		va_start(args, format);
-		vfprintf(stderr, format, args);
-		va_end(args);
-	}
-	else
-	{
-		static const char *const usage[] = {
-			"\n",
-			"Usage: bamtk tview [options] <aln.bam> [ref.fasta]\n",
-			"Options:\n",
-			" -d display output as (H)tml or (C)urses or (T)ext \n",
-			" -p chr:pos go directly to this position\n",
-			" -s STR display only reads from this sample or group\n",
-			"\n\n",
-		};
-		size_t line;
-		for (line = 0; line < sizeof(usage)/sizeof(usage[0]); ++line)
-			fputs(usage[line], stderr);
-	}
-	exit(-1);
-}
-
-enum dipsay_mode {display_ncurses,display_html,display_text};
-extern tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples);
-extern tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples);
-extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples);
-
-/* Command-line front end of "tview": chooses the display back end (-d),
- * creates the viewer, optionally jumps to the position given with -p, then
- * draws the alignment and runs the back end's main loop. */
-int bam_tview_main(int argc, char *argv[])
-	{
-	int view_mode=display_ncurses;
-	tview_t* tv=NULL;
-	char *samples=NULL, *position=NULL;
-	const char *fn, *fn_fa;
-	int c;
-
-	while ((c = getopt(argc, argv, "s:p:d:")) >= 0) {
-		if (c == 's') samples=optarg;
-		else if (c == 'p') position=optarg;
-		else if (c == 'd')
-			{
-			// only the first letter matters; anything unknown means curses
-			const char m = optarg[0];
-			if (m == 'H' || m == 'h') view_mode=display_html;
-			else if (m == 'T' || m == 't') view_mode=display_text;
-			else view_mode=display_ncurses;
-			}
-		else error(NULL);
-	}
-	if (argc==optind) error(NULL);
-
-	fn = argv[optind];
-	fn_fa = (optind+1>=argc)? 0 : argv[optind+1]; // the reference is optional
-	if (view_mode == display_ncurses) tv = curses_tv_init(fn, fn_fa, samples);
-	else if (view_mode == display_text) tv = text_tv_init(fn, fn_fa, samples);
-	else if (view_mode == display_html) tv = html_tv_init(fn, fn_fa, samples);
-
-	if(tv==NULL)
-		{
-		error("cannot create view");
-		return EXIT_FAILURE;
-		}
-
-	if ( position )
-		{
-		int _tid = -1, _beg, _end;
-		bam_parse_region(tv->header, position, &_tid, &_beg, &_end);
-		if (_tid >= 0) { tv->curr_tid = _tid; tv->left_pos = _beg; }
-		}
-	tv->my_drawaln(tv, tv->curr_tid, tv->left_pos);
-	tv->my_loop(tv);
-	tv->my_destroy(tv);
-
-	return EXIT_SUCCESS;
-	}
diff --git a/sam/bam_tview.h b/sam/bam_tview.h
deleted file mode 100644
index 80f0464..0000000
--- a/sam/bam_tview.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef BAM_TVIEW_H
-#define BAM_TVIEW_H
-
-#include <ctype.h>
-#include <assert.h>
-#include <string.h>
-#include <math.h>
-#include <unistd.h>
-#include <stdarg.h>
-#include "bam.h"
-#include "faidx.h"
-#include "bam2bcf.h"
-#include "sam_header.h"
-#include "khash.h"
-
-KHASH_MAP_INIT_STR(kh_rg, const char *)
-
-typedef struct AbstractTview {
- int mrow, mcol;
-
- bam_index_t *idx;
- bam_lplbuf_t *lplbuf;
- bam_header_t *header;
- bamFile fp;
- int curr_tid, left_pos;
- faidx_t *fai;
- bcf_callaux_t *bca;
-
- int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins, no_skip, show_name;
- char *ref;
- khash_t(kh_rg) *rg_hash;
- /* callbacks */
- void (*my_destroy)(struct AbstractTview* );
- void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...);
- void (*my_mvaddch)(struct AbstractTview*,int,int,int);
- void (*my_attron)(struct AbstractTview*,int);
- void (*my_attroff)(struct AbstractTview*,int);
- void (*my_clear)(struct AbstractTview*);
- int (*my_colorpair)(struct AbstractTview*,int);
- int (*my_drawaln)(struct AbstractTview*,int,int);
- int (*my_loop)(struct AbstractTview*);
- int (*my_underline)(struct AbstractTview*);
-} tview_t;
-
-
-char bam_aux_getCEi(bam1_t *b, int i);
-char bam_aux_getCSi(bam1_t *b, int i);
-char bam_aux_getCQi(bam1_t *b, int i);
-
-#define TV_MIN_ALNROW 2
-#define TV_MAX_GOTO 40
-#define TV_LOW_MAPQ 10
-
-#define TV_COLOR_MAPQ 0
-#define TV_COLOR_BASEQ 1
-#define TV_COLOR_NUCL 2
-#define TV_COLOR_COL 3
-#define TV_COLOR_COLQ 4
-
-#define TV_BASE_NUCL 0
-#define TV_BASE_COLOR_SPACE 1
-
-int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
-int base_tv_init(tview_t*,const char *fn, const char *fn_fa, const char *samples);
-void base_tv_destroy(tview_t*);
-int base_draw_aln(tview_t *tv, int tid, int pos);
-
-typedef struct Tixel
- {
- int ch;
- int attributes;
- }tixel_t;
-
-#endif
-
diff --git a/sam/bam_tview_curses.c b/sam/bam_tview_curses.c
deleted file mode 100644
index 4fdd1fb..0000000
--- a/sam/bam_tview_curses.c
+++ /dev/null
@@ -1,297 +0,0 @@
-#undef _HAVE_CURSES
-
-#if _CURSES_LIB == 0
-#elif _CURSES_LIB == 1
-#include <curses.h>
-#ifndef NCURSES_VERSION
-#warning "_CURSES_LIB=1 but NCURSES_VERSION not defined; tview is NOT compiled"
-#else
-#define _HAVE_CURSES
-#endif
-#elif _CURSES_LIB == 2
-#include <xcurses.h>
-#define _HAVE_CURSES
-#else
-#warning "_CURSES_LIB is not 0, 1 or 2; tview is NOT compiled"
-#endif
-
-
-#include "bam_tview.h"
-
-#ifdef _HAVE_CURSES
-
-
-
-typedef struct CursesTview {
- tview_t view;
- WINDOW *wgoto, *whelp;
- } curses_tview_t;
-
-
-
-
-#define FROM_TV(ptr) ((curses_tview_t*)ptr)
-
-static void curses_destroy(tview_t* base)
- {
- curses_tview_t* tv=(curses_tview_t*)base;
-
-
- delwin(tv->wgoto); delwin(tv->whelp);
- endwin();
-
- base_tv_destroy(base);
-
- free(tv);
- }
-
-/*
- void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...);
- void (*my_)(struct AbstractTview*,int,int,int);
- void (*my_attron)(struct AbstractTview*,int);
- void (*my_attroff)(struct AbstractTview*,int);
- void (*my_clear)(struct AbstractTview*);
- int (*my_colorpair)(struct AbstractTview*,int);
-*/
-
-static void curses_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...)
- {
- unsigned int size=tv->mcol+2;
- char* str=malloc(size);
- if(str==0) exit(EXIT_FAILURE);
- va_list argptr;
- va_start(argptr, fmt);
- vsnprintf(str,size, fmt, argptr);
- va_end(argptr);
- mvprintw(y,x,str);
- free(str);
- }
-
-static void curses_mvaddch(struct AbstractTview* tv,int y,int x,int ch)
- {
- mvaddch(y,x,ch);
- }
-
-static void curses_attron(struct AbstractTview* tv,int flag)
- {
- attron(flag);
- }
-static void curses_attroff(struct AbstractTview* tv,int flag)
- {
- attroff(flag);
- }
-static void curses_clear(struct AbstractTview* tv)
- {
- clear();
- }
-
-static int curses_colorpair(struct AbstractTview* tv,int flag)
- {
- return COLOR_PAIR(flag);
- }
-
-static int curses_drawaln(struct AbstractTview* tv, int tid, int pos)
- {
- return base_draw_aln(tv, tid, pos);
- }
-
-
-
-static void tv_win_goto(curses_tview_t *tv, int *tid, int *pos)
- {
- char str[256], *p;
- int i, l = 0;
- tview_t *base=(tview_t*)tv;
- wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+');
- mvwprintw(tv->wgoto, 1, 2, "Goto: ");
- for (;;) {
- int c = wgetch(tv->wgoto);
- wrefresh(tv->wgoto);
- if (c == KEY_BACKSPACE || c == '\010' || c == '\177') {
- if(l > 0) --l;
- } else if (c == KEY_ENTER || c == '\012' || c == '\015') {
- int _tid = -1, _beg, _end;
- if (str[0] == '=') {
- _beg = strtol(str+1, &p, 10) - 1;
- if (_beg > 0) {
- *pos = _beg;
- return;
- }
- } else {
- bam_parse_region(base->header, str, &_tid, &_beg, &_end);
- if (_tid >= 0) {
- *tid = _tid; *pos = _beg;
- return;
- }
- }
- } else if (isgraph(c)) {
- if (l < TV_MAX_GOTO) str[l++] = c;
- } else if (c == '\027') l = 0;
- else if (c == '\033') return;
- str[l] = '\0';
- for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' ');
- mvwprintw(tv->wgoto, 1, 8, "%s", str);
- }
-}
-
-
-
-
-static void tv_win_help(curses_tview_t *tv) {
- int r = 1;
- tview_t* base=(tview_t*)base;
- WINDOW *win = tv->whelp;
- wborder(win, '|', '|', '-', '-', '+', '+', '+', '+');
- mvwprintw(win, r++, 2, " -=- Help -=- ");
- r++;
- mvwprintw(win, r++, 2, "? This window");
- mvwprintw(win, r++, 2, "Arrows Small scroll movement");
- mvwprintw(win, r++, 2, "h,j,k,l Small scroll movement");
- mvwprintw(win, r++, 2, "H,J,K,L Large scroll movement");
- mvwprintw(win, r++, 2, "ctrl-H Scroll 1k left");
- mvwprintw(win, r++, 2, "ctrl-L Scroll 1k right");
- mvwprintw(win, r++, 2, "space Scroll one screen");
- mvwprintw(win, r++, 2, "backspace Scroll back one screen");
- mvwprintw(win, r++, 2, "g Go to specific location");
- mvwprintw(win, r++, 2, "m Color for mapping qual");
- mvwprintw(win, r++, 2, "n Color for nucleotide");
- mvwprintw(win, r++, 2, "b Color for base quality");
- mvwprintw(win, r++, 2, "c Color for cs color");
- mvwprintw(win, r++, 2, "z Color for cs qual");
- mvwprintw(win, r++, 2, ". Toggle on/off dot view");
- mvwprintw(win, r++, 2, "s Toggle on/off ref skip");
- mvwprintw(win, r++, 2, "r Toggle on/off rd name");
- mvwprintw(win, r++, 2, "N Turn on nt view");
- mvwprintw(win, r++, 2, "C Turn on cs view");
- mvwprintw(win, r++, 2, "i Toggle on/off ins");
- mvwprintw(win, r++, 2, "q Exit");
- r++;
- mvwprintw(win, r++, 2, "Underline: Secondary or orphan");
- mvwprintw(win, r++, 2, "Blue: 0-9 Green: 10-19");
- mvwprintw(win, r++, 2, "Yellow: 20-29 White: >=30");
- wrefresh(win);
- wgetch(win);
-}
-
-static int curses_underline(tview_t* tv)
- {
- return A_UNDERLINE;
- }
-
-static int curses_loop(tview_t* tv)
- {
- int tid, pos;
- curses_tview_t *CTV=(curses_tview_t *)tv;
- tid = tv->curr_tid; pos = tv->left_pos;
- while (1) {
- int c = getch();
- switch (c) {
- case '?': tv_win_help(CTV); break;
- case '\033':
- case 'q': goto end_loop;
- case '/':
- case 'g': tv_win_goto(CTV, &tid, &pos); break;
- case 'm': tv->color_for = TV_COLOR_MAPQ; break;
- case 'b': tv->color_for = TV_COLOR_BASEQ; break;
- case 'n': tv->color_for = TV_COLOR_NUCL; break;
- case 'c': tv->color_for = TV_COLOR_COL; break;
- case 'z': tv->color_for = TV_COLOR_COLQ; break;
- case 's': tv->no_skip = !tv->no_skip; break;
- case 'r': tv->show_name = !tv->show_name; break;
- case KEY_LEFT:
- case 'h': --pos; break;
- case KEY_RIGHT:
- case 'l': ++pos; break;
- case KEY_SLEFT:
- case 'H': pos -= 20; break;
- case KEY_SRIGHT:
- case 'L': pos += 20; break;
- case '.': tv->is_dot = !tv->is_dot; break;
- case 'N': tv->base_for = TV_BASE_NUCL; break;
- case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break;
- case 'i': tv->ins = !tv->ins; break;
- case '\010': pos -= 1000; break;
- case '\014': pos += 1000; break;
- case ' ': pos += tv->mcol; break;
- case KEY_UP:
- case 'j': --tv->row_shift; break;
- case KEY_DOWN:
- case 'k': ++tv->row_shift; break;
- case KEY_BACKSPACE:
- case '\177': pos -= tv->mcol; break;
- case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break;
- default: continue;
- }
- if (pos < 0) pos = 0;
- if (tv->row_shift < 0) tv->row_shift = 0;
- tv->my_drawaln(tv, tid, pos);
- }
-end_loop:
- return 0;
-}
-
-
-
-
-tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples)
- {
- curses_tview_t *tv = (curses_tview_t*)calloc(1, sizeof(curses_tview_t));
- tview_t* base=(tview_t*)tv;
- if(tv==0)
- {
- fprintf(stderr,"Calloc failed\n");
- return 0;
- }
-
- base_tv_init(base,fn,fn_fa,samples);
- /* initialize callbacks */
-#define SET_CALLBACK(fun) base->my_##fun=curses_##fun;
- SET_CALLBACK(destroy);
- SET_CALLBACK(mvprintw);
- SET_CALLBACK(mvaddch);
- SET_CALLBACK(attron);
- SET_CALLBACK(attroff);
- SET_CALLBACK(clear);
- SET_CALLBACK(colorpair);
- SET_CALLBACK(drawaln);
- SET_CALLBACK(loop);
- SET_CALLBACK(underline);
-#undef SET_CALLBACK
-
- initscr();
- keypad(stdscr, TRUE);
- clear();
- noecho();
- cbreak();
-
- getmaxyx(stdscr, base->mrow, base->mcol);
- tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5);
- tv->whelp = newwin(29, 40, 5, 5);
-
- start_color();
- init_pair(1, COLOR_BLUE, COLOR_BLACK);
- init_pair(2, COLOR_GREEN, COLOR_BLACK);
- init_pair(3, COLOR_YELLOW, COLOR_BLACK);
- init_pair(4, COLOR_WHITE, COLOR_BLACK);
- init_pair(5, COLOR_GREEN, COLOR_BLACK);
- init_pair(6, COLOR_CYAN, COLOR_BLACK);
- init_pair(7, COLOR_YELLOW, COLOR_BLACK);
- init_pair(8, COLOR_RED, COLOR_BLACK);
- init_pair(9, COLOR_BLUE, COLOR_BLACK);
- return base;
- }
-
-
-#else // #ifdef _HAVE_CURSES
-#include <stdio.h>
-#warning "No curses library is available; tview with curses is disabled."
-
-extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples);
-
-tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples)
- {
- return text_tv_init(fn,fn_fa,samples);
- }
-#endif // #ifdef _HAVE_CURSES
-
-
diff --git a/sam/bam_tview_html.c b/sam/bam_tview_html.c
deleted file mode 100644
index f52b4c3..0000000
--- a/sam/bam_tview_html.c
+++ /dev/null
@@ -1,349 +0,0 @@
-#include <unistd.h>
-#include "bam_tview.h"
-
-#define UNDERLINE_FLAG 10
-
-typedef struct HtmlTview {
- tview_t view;
- int row_count;
- tixel_t** screen;
- FILE* out;
- int attributes;/* color... */
- } html_tview_t;
-
-#define FROM_TV(ptr) ((html_tview_t*)ptr)
-
-static void html_destroy(tview_t* base)
- {
- int i;
- html_tview_t* tv=(html_tview_t*)base;
- if(tv->screen!=NULL)
- {
- for(i=0;i< tv->row_count;++i) free(tv->screen[i]);
- free(tv->screen);
- }
- base_tv_destroy(base);
- free(tv);
- }
-
-/*
- void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...);
- void (*my_)(struct AbstractTview*,int,int,int);
- void (*my_attron)(struct AbstractTview*,int);
- void (*my_attroff)(struct AbstractTview*,int);
- void (*my_clear)(struct AbstractTview*);
- int (*my_colorpair)(struct AbstractTview*,int);
-*/
-
-static void html_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...)
- {
- int i,nchars=0;
- unsigned int size=tv->mcol+2;
- char* str=malloc(size);
- if(str==0) exit(EXIT_FAILURE);
- va_list argptr;
- va_start(argptr, fmt);
- nchars=vsnprintf(str,size, fmt, argptr);
- va_end(argptr);
-
- for(i=0;i< nchars;++i)
- {
- tv->my_mvaddch(tv,y,x+i,str[i]);
- }
- free(str);
- }
-
-static void html_mvaddch(struct AbstractTview* tv,int y,int x,int ch)
- {
- tixel_t* row=NULL;
- html_tview_t* ptr=FROM_TV(tv);
- if( x >= tv->mcol ) return; //out of screen
- while(ptr->row_count<=y)
- {
- int x;
- row=(tixel_t*)calloc(tv->mcol,sizeof(tixel_t));
- if(row==0) exit(EXIT_FAILURE);
- for(x=0;x<tv->mcol;++x) {row[x].ch=' ';row[x].attributes=0;}
- ptr->screen=(tixel_t**)realloc(ptr->screen,sizeof(tixel_t*)*(ptr->row_count+1));
- ptr->screen[ptr->row_count++]=row;
- }
- row=ptr->screen[y];
- row[x].ch=ch;
- row[x].attributes=ptr->attributes;
- }
-
-static void html_attron(struct AbstractTview* tv,int flag)
- {
- html_tview_t* ptr=FROM_TV(tv);
- ptr->attributes |= flag;
-
-
- }
-
-static void html_attroff(struct AbstractTview* tv,int flag)
- {
- html_tview_t* ptr=FROM_TV(tv);
- ptr->attributes &= ~(flag);
- }
-
-static void html_clear(struct AbstractTview* tv)
- {
- html_tview_t* ptr=FROM_TV(tv);
- if(ptr->screen!=NULL)
- {
- int i;
- for(i=0;i< ptr->row_count;++i) free(ptr->screen[i]);
- free(ptr->screen);
- ptr->screen=NULL;
- }
- ptr->row_count=0;
- ptr->attributes=0;
- }
-
-static int html_colorpair(struct AbstractTview* tv,int flag)
- {
- return (1 << (flag));
- }
-
-static int html_drawaln(struct AbstractTview* tv, int tid, int pos)
- {
- int y,x;
- html_tview_t* ptr=FROM_TV(tv);
- html_clear(tv);
- base_draw_aln(tv, tid, pos);
- fputs("<html><head>",ptr->out);
- fprintf(ptr->out,"<title>%s:%d</title>",
- tv->header->target_name[tid],
- pos+1
- );
- //style
-
- fputs("<style type='text/css'>\n",ptr->out);
- fputs(".tviewbody { margin:5px; background-color:white;text-align:center;}\n",ptr->out);
- fputs(".tviewtitle {text-align:center;}\n",ptr->out);
- fputs(".tviewpre { margin:5px; background-color:white;}\n",ptr->out);
- #define CSS(id,col) fprintf(ptr->out,".tviewc%d {color:%s;}\n.tviewcu%d {color:%s;text-decoration:underline;}\n",id,col,id,col);
- CSS(0, "black");
- CSS(1, "blue");
- CSS(2, "green");
- CSS(3, "yellow");
- CSS(4, "black");
- CSS(5, "green");
- CSS(6, "cyan");
- CSS(7, "yellow");
- CSS(8, "red");
- CSS(9, "blue");
- #undef CSS
- fputs("</style>",ptr->out);
-
- fputs("</head><body>",ptr->out);
-
- fprintf(ptr->out,"<div class='tviewbody'><div class='tviewtitle'>%s:%d</div>",
- tv->header->target_name[tid],
- pos+1
- );
-
- fputs("<pre class='tviewpre'>",ptr->out);
- for(y=0;y< ptr->row_count;++y)
- {
-
- for(x=0;x< tv->mcol;++x)
- {
-
-
- if(x== 0 || ptr->screen[y][x].attributes != ptr->screen[y][x-1].attributes)
- {
- int css=0;
- fprintf(ptr->out,"<span");
- while(css<32)
- {
- //if(y>1) fprintf(stderr,"css=%d pow2=%d vs %d\n",css,(1 << (css)),ptr->screen[y][x].attributes);
- if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0)
- {
-
- fprintf(ptr->out," class='tviewc%s%d'",
- (( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)) )!=0?"u":""),
- css);
- break;
- }
- ++css;
- }
-
-
- fputs(">",ptr->out);
- }
-
- int ch=ptr->screen[y][x].ch;
- switch(ch)
- {
- case '<': fputs("<",ptr->out);break;
- case '>': fputs(">",ptr->out);break;
- case '&': fputs("&",ptr->out);break;
- default: fputc(ch,ptr->out); break;
- }
-
-
- if(x+1 == tv->mcol || ptr->screen[y][x].attributes!=ptr->screen[y][x+1].attributes)
- {
- fputs("</span>",ptr->out);
- }
- }
- if(y+1 < ptr->row_count) fputs("<br/>",ptr->out);
- }
- fputs("</pre></div></body></html>",ptr->out);
- return 0;
- }
-
-
-#define ANSI_COLOR_RED "\x1b[31m"
-#define ANSI_COLOR_GREEN "\x1b[32m"
-#define ANSI_COLOR_YELLOW "\x1b[33m"
-#define ANSI_COLOR_BLUE "\x1b[34m"
-#define ANSI_COLOR_MAGENTA "\x1b[35m"
-#define ANSI_COLOR_CYAN "\x1b[36m"
-#define ANSI_COLOR_BLACK "\x1b[0m"
-#define ANSI_COLOR_RESET ANSI_COLOR_BLACK
-
-#define ANSI_UNDERLINE_SET "\033[4m"
-#define ANSI_UNDERLINE_UNSET "\033[0m"
-
-static int text_drawaln(struct AbstractTview* tv, int tid, int pos)
- {
- int y,x;
- html_tview_t* ptr=FROM_TV(tv);
- html_clear(tv);
- base_draw_aln(tv, tid, pos);
- int is_term= isatty(fileno(ptr->out));
-
- for(y=0;y< ptr->row_count;++y)
- {
- for(x=0;x< tv->mcol;++x)
- {
- if(is_term)
- {
- int css=0;
- while(css<32)
- {
- if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0)
- {
- break;
- }
- ++css;
- }
- switch(css)
- {
- //CSS(0, "black");
- case 1: fputs(ANSI_COLOR_BLUE,ptr->out); break;
- case 2: fputs(ANSI_COLOR_GREEN,ptr->out); break;
- case 3: fputs(ANSI_COLOR_YELLOW,ptr->out); break;
- //CSS(4, "black");
- case 5: fputs(ANSI_COLOR_GREEN,ptr->out); break;
- case 6: fputs(ANSI_COLOR_CYAN,ptr->out); break;
- case 7: fputs(ANSI_COLOR_YELLOW,ptr->out); break;
- case 8: fputs(ANSI_COLOR_RED,ptr->out); break;
- case 9: fputs(ANSI_COLOR_BLUE,ptr->out); break;
- default:break;
- }
- if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0)
- {
- fputs(ANSI_UNDERLINE_SET,ptr->out);
- }
-
- }
-
-
- int ch=ptr->screen[y][x].ch;
-
- fputc(ch,ptr->out);
- if(is_term)
- {
- fputs(ANSI_COLOR_RESET,ptr->out);
- if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0)
- {
- fputs(ANSI_UNDERLINE_UNSET,ptr->out);
- }
- }
- }
- fputc('\n',ptr->out);
- }
- return 0;
- }
-
-
-static int html_loop(tview_t* tv)
- {
- //tv->my_drawaln(tv, tv->curr_tid, tv->left_pos);
- return 0;
- }
-
-static int html_underline(tview_t* tv)
- {
- return (1 << UNDERLINE_FLAG);
- }
-
-/*
-static void init_pair(html_tview_t *tv,int id_ge_1, const char* pen, const char* paper)
- {
-
- }
-*/
-
-tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples)
- {
- char* colstr=getenv("COLUMNS");
- html_tview_t *tv = (html_tview_t*)calloc(1, sizeof(html_tview_t));
- tview_t* base=(tview_t*)tv;
- if(tv==0)
- {
- fprintf(stderr,"Calloc failed\n");
- return 0;
- }
- tv->row_count=0;
- tv->screen=NULL;
- tv->out=stdout;
- tv->attributes=0;
- base_tv_init(base,fn,fn_fa,samples);
- /* initialize callbacks */
-#define SET_CALLBACK(fun) base->my_##fun=html_##fun;
- SET_CALLBACK(destroy);
- SET_CALLBACK(mvprintw);
- SET_CALLBACK(mvaddch);
- SET_CALLBACK(attron);
- SET_CALLBACK(attroff);
- SET_CALLBACK(clear);
- SET_CALLBACK(colorpair);
- SET_CALLBACK(drawaln);
- SET_CALLBACK(loop);
- SET_CALLBACK(underline);
-#undef SET_CALLBACK
-
-
- if(colstr!=0)
- {
- base->mcol=atoi(colstr);
- if(base->mcol<10) base->mcol=80;
- }
- base->mrow=99999;
-
-/*
- init_pair(tv,1, "blue", "white");
- init_pair(tv,2, "green", "white");
- init_pair(tv,3, "yellow", "white");
- init_pair(tv,4, "white", "white");
- init_pair(tv,5, "green", "white");
- init_pair(tv,6, "cyan", "white");
- init_pair(tv,7, "yellow", "white");
- init_pair(tv,8, "red", "white");
- init_pair(tv,9, "blue", "white");
- */
- return base;
- }
-
-
-tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples)
- {
- tview_t* tv=html_tv_init(fn,fn_fa,samples);
- tv->my_drawaln=text_drawaln;
- return tv;
- }
-
diff --git a/sam/bamshuf.c b/sam/bamshuf.c
deleted file mode 100644
index 33a5238..0000000
--- a/sam/bamshuf.c
+++ /dev/null
@@ -1,141 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-#include "sam.h"
-#include "ksort.h"
-
-#define DEF_CLEVEL 1
-
-static inline unsigned hash_Wang(unsigned key)
-{
- key += ~(key << 15);
- key ^= (key >> 10);
- key += (key << 3);
- key ^= (key >> 6);
- key += ~(key << 11);
- key ^= (key >> 16);
- return key;
-}
-
-static inline unsigned hash_X31_Wang(const char *s)
-{
- unsigned h = *s;
- if (h) {
- for (++s ; *s; ++s) h = (h << 5) - h + *s;
- return hash_Wang(h);
- } else return 0;
-}
-
-typedef struct {
- unsigned key;
- bam1_t *b;
-} elem_t;
-
-static inline int elem_lt(elem_t x, elem_t y)
-{
- if (x.key < y.key) return 1;
- if (x.key == y.key) {
- int t;
- t = strcmp(bam_get_qname(x.b), bam_get_qname(y.b));
- if (t < 0) return 1;
- return (t == 0 && ((x.b->core.flag>>6&3) < (y.b->core.flag>>6&3)));
- } else return 0;
-}
-
-KSORT_INIT(bamshuf, elem_t, elem_lt)
-
-static void bamshuf(const char *fn, int n_files, const char *pre, int clevel, int is_stdout)
-{
- BGZF *fp, *fpw, **fpt;
- char **fnt, modew[8];
- bam1_t *b;
- int i, l;
- bam_hdr_t *h;
- int64_t *cnt;
-
- // split
- fp = strcmp(fn, "-")? bgzf_open(fn, "r") : bgzf_dopen(fileno(stdin), "r");
- assert(fp);
- h = bam_hdr_read(fp);
- fnt = (char**)calloc(n_files, sizeof(void*));
- fpt = (BGZF**)calloc(n_files, sizeof(void*));
- cnt = (int64_t*)calloc(n_files, 8);
- l = strlen(pre);
- for (i = 0; i < n_files; ++i) {
- fnt[i] = (char*)calloc(l + 10, 1);
- sprintf(fnt[i], "%s.%.4d.bam", pre, i);
- fpt[i] = bgzf_open(fnt[i], "w1");
- bam_hdr_write(fpt[i], h);
- }
- b = bam_init1();
- while (bam_read1(fp, b) >= 0) {
- uint32_t x;
- x = hash_X31_Wang(bam_get_qname(b)) % n_files;
- bam_write1(fpt[x], b);
- ++cnt[x];
- }
- bam_destroy1(b);
- for (i = 0; i < n_files; ++i) bgzf_close(fpt[i]);
- free(fpt);
- bgzf_close(fp);
- // merge
- sprintf(modew, "w%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
- if (!is_stdout) { // output to a file
- char *fnw = (char*)calloc(l + 5, 1);
- sprintf(fnw, "%s.bam", pre);
- fpw = bgzf_open(fnw, modew);
- free(fnw);
- } else fpw = bgzf_dopen(fileno(stdout), modew); // output to stdout
- bam_hdr_write(fpw, h);
- bam_hdr_destroy(h);
- for (i = 0; i < n_files; ++i) {
- int64_t j, c = cnt[i];
- elem_t *a;
- fp = bgzf_open(fnt[i], "r");
- bam_hdr_destroy(bam_hdr_read(fp));
- a = (elem_t*)calloc(c, sizeof(elem_t));
- for (j = 0; j < c; ++j) {
- a[j].b = bam_init1();
- assert(bam_read1(fp, a[j].b) >= 0);
- a[j].key = hash_X31_Wang(bam_get_qname(a[j].b));
- }
- bgzf_close(fp);
- unlink(fnt[i]);
- free(fnt[i]);
- ks_introsort(bamshuf, c, a);
- for (j = 0; j < c; ++j) {
- bam_write1(fpw, a[j].b);
- bam_destroy1(a[j].b);
- }
- free(a);
- }
- bgzf_close(fpw);
- free(fnt); free(cnt);
-}
-
-int main_bamshuf(int argc, char *argv[])
-{
- int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0;
- while ((c = getopt(argc, argv, "n:l:uO")) >= 0) {
- switch (c) {
- case 'n': n_files = atoi(optarg); break;
- case 'l': clevel = atoi(optarg); break;
- case 'u': is_un = 1; break;
- case 'O': is_stdout = 1; break;
- }
- }
- if (is_un) clevel = 0;
- if (optind + 2 > argc) {
- fprintf(stderr, "\nUsage: bamshuf [-Ou] [-n nFiles] [-c cLevel] <in.bam> <out.prefix>\n\n");
- fprintf(stderr, "Options: -O output to stdout\n");
- fprintf(stderr, " -u uncompressed BAM output\n");
- fprintf(stderr, " -l INT compression level [%d]\n", DEF_CLEVEL);
- fprintf(stderr, " -n INT number of temporary files [%d]\n", n_files);
- fprintf(stderr, "\n");
- return 1;
- }
- bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_stdout);
- return 0;
-}
diff --git a/sam/bamtk.c b/sam/bamtk.c
deleted file mode 100644
index 9df7c11..0000000
--- a/sam/bamtk.c
+++ /dev/null
@@ -1,119 +0,0 @@
-#include <stdio.h>
-#include <unistd.h>
-#include <assert.h>
-#include <fcntl.h>
-#include "bam.h"
-
-#ifdef _USE_KNETFILE
-#include "knetfile.h"
-#endif
-
-int bam_taf2baf(int argc, char *argv[]);
-int bam_mpileup(int argc, char *argv[]);
-int bam_merge(int argc, char *argv[]);
-int bam_index(int argc, char *argv[]);
-int bam_sort(int argc, char *argv[]);
-int bam_tview_main(int argc, char *argv[]);
-int bam_mating(int argc, char *argv[]);
-int bam_rmdup(int argc, char *argv[]);
-int bam_flagstat(int argc, char *argv[]);
-int bam_fillmd(int argc, char *argv[]);
-int bam_idxstats(int argc, char *argv[]);
-int main_samview(int argc, char *argv[]);
-int main_import(int argc, char *argv[]);
-int main_reheader(int argc, char *argv[]);
-int main_cut_target(int argc, char *argv[]);
-int main_phase(int argc, char *argv[]);
-int main_cat(int argc, char *argv[]);
-int main_depth(int argc, char *argv[]);
-int main_bam2fq(int argc, char *argv[]);
-int main_pad2unpad(int argc, char *argv[]);
-int main_bedcov(int argc, char *argv[]);
-int main_bamshuf(int argc, char *argv[]);
-
-int faidx_main(int argc, char *argv[]);
-
-static int usage()
-{
- fprintf(stderr, "\n");
- fprintf(stderr, "Program: samtools (Tools for alignments in the SAM format)\n");
- fprintf(stderr, "Version: %s\n\n", BAM_VERSION);
- fprintf(stderr, "Usage: samtools <command> [options]\n\n");
- fprintf(stderr, "Command: view SAM<->BAM conversion\n");
- fprintf(stderr, " sort sort alignment file\n");
- fprintf(stderr, " mpileup multi-way pileup\n");
- fprintf(stderr, " depth compute the depth\n");
- fprintf(stderr, " faidx index/extract FASTA\n");
-#if _CURSES_LIB != 0
- fprintf(stderr, " tview text alignment viewer\n");
-#endif
- fprintf(stderr, " index index alignment\n");
- fprintf(stderr, " idxstats BAM index stats (r595 or later)\n");
- fprintf(stderr, " fixmate fix mate information\n");
- fprintf(stderr, " flagstat simple stats\n");
- fprintf(stderr, " calmd recalculate MD/NM tags and '=' bases\n");
- fprintf(stderr, " merge merge sorted alignments\n");
- fprintf(stderr, " rmdup remove PCR duplicates\n");
- fprintf(stderr, " reheader replace BAM header\n");
- fprintf(stderr, " cat concatenate BAMs\n");
- fprintf(stderr, " bedcov read depth per BED region\n");
- fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n");
- fprintf(stderr, " phase phase heterozygotes\n");
- fprintf(stderr, " bamshuf shuffle and group alignments by name\n");
-// fprintf(stderr, " depad convert padded BAM to unpadded BAM\n"); // not stable
- fprintf(stderr, "\n");
-#ifdef _WIN32
- fprintf(stderr, "\
-Note: The Windows version of SAMtools is mainly designed for read-only\n\
- operations, such as viewing the alignments and generating the pileup.\n\
- Binary files generated by the Windows version may be buggy.\n\n");
-#endif
- return 1;
-}
-
-int main(int argc, char *argv[])
-{
-#ifdef _WIN32
- setmode(fileno(stdout), O_BINARY);
- setmode(fileno(stdin), O_BINARY);
-#ifdef _USE_KNETFILE
- knet_win32_init();
-#endif
-#endif
- if (argc < 2) return usage();
- if (strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1);
- else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1);
- else if (strcmp(argv[1], "mpileup") == 0) return bam_mpileup(argc-1, argv+1);
- else if (strcmp(argv[1], "merge") == 0) return bam_merge(argc-1, argv+1);
- else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1);
- else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1);
- else if (strcmp(argv[1], "idxstats") == 0) return bam_idxstats(argc-1, argv+1);
- else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1);
- else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1);
- else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1);
- else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1);
- else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1);
- else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1);
- else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1);
- else if (strcmp(argv[1], "cat") == 0) return main_cat(argc-1, argv+1);
- else if (strcmp(argv[1], "targetcut") == 0) return main_cut_target(argc-1, argv+1);
- else if (strcmp(argv[1], "phase") == 0) return main_phase(argc-1, argv+1);
- else if (strcmp(argv[1], "depth") == 0) return main_depth(argc-1, argv+1);
- else if (strcmp(argv[1], "bam2fq") == 0) return main_bam2fq(argc-1, argv+1);
- else if (strcmp(argv[1], "pad2unpad") == 0) return main_pad2unpad(argc-1, argv+1);
- else if (strcmp(argv[1], "depad") == 0) return main_pad2unpad(argc-1, argv+1);
- else if (strcmp(argv[1], "bedcov") == 0) return main_bedcov(argc-1, argv+1);
- else if (strcmp(argv[1], "bamshuf") == 0) return main_bamshuf(argc-1, argv+1);
- else if (strcmp(argv[1], "pileup") == 0) {
- fprintf(stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n");
- return 1;
- }
-#if _CURSES_LIB != 0
- else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1);
-#endif
- else {
- fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
- return 1;
- }
- return 0;
-}
diff --git a/sam/bcftools/._Makefile b/sam/bcftools/._Makefile
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._Makefile and /dev/null differ
diff --git a/sam/bcftools/._README b/sam/bcftools/._README
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._README and /dev/null differ
diff --git a/sam/bcftools/._bcf.c b/sam/bcftools/._bcf.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._bcf.c and /dev/null differ
diff --git a/sam/bcftools/._bcf.h b/sam/bcftools/._bcf.h
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._bcf.h and /dev/null differ
diff --git a/sam/bcftools/._bcf.tex b/sam/bcftools/._bcf.tex
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._bcf.tex and /dev/null differ
diff --git a/sam/bcftools/._bcf2qcall.c b/sam/bcftools/._bcf2qcall.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._bcf2qcall.c and /dev/null differ
diff --git a/sam/bcftools/._bcfutils.c b/sam/bcftools/._bcfutils.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._bcfutils.c and /dev/null differ
diff --git a/sam/bcftools/._call1.c b/sam/bcftools/._call1.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._call1.c and /dev/null differ
diff --git a/sam/bcftools/._em.c b/sam/bcftools/._em.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._em.c and /dev/null differ
diff --git a/sam/bcftools/._fet.c b/sam/bcftools/._fet.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._fet.c and /dev/null differ
diff --git a/sam/bcftools/._index.c b/sam/bcftools/._index.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._index.c and /dev/null differ
diff --git a/sam/bcftools/._kfunc.c b/sam/bcftools/._kfunc.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._kfunc.c and /dev/null differ
diff --git a/sam/bcftools/._kmin.c b/sam/bcftools/._kmin.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._kmin.c and /dev/null differ
diff --git a/sam/bcftools/._kmin.h b/sam/bcftools/._kmin.h
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._kmin.h and /dev/null differ
diff --git a/sam/bcftools/._main.c b/sam/bcftools/._main.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._main.c and /dev/null differ
diff --git a/sam/bcftools/._mut.c b/sam/bcftools/._mut.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._mut.c and /dev/null differ
diff --git a/sam/bcftools/._prob1.c b/sam/bcftools/._prob1.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._prob1.c and /dev/null differ
diff --git a/sam/bcftools/._prob1.h b/sam/bcftools/._prob1.h
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._prob1.h and /dev/null differ
diff --git a/sam/bcftools/._vcf.c b/sam/bcftools/._vcf.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/bcftools/._vcf.c and /dev/null differ
diff --git a/sam/bcftools/._vcfutils.pl b/sam/bcftools/._vcfutils.pl
deleted file mode 100755
index 94286bb..0000000
Binary files a/sam/bcftools/._vcfutils.pl and /dev/null differ
diff --git a/sam/bcftools/Makefile b/sam/bcftools/Makefile
deleted file mode 100644
index be831de..0000000
--- a/sam/bcftools/Makefile
+++ /dev/null
@@ -1,51 +0,0 @@
-CC= gcc
-CFLAGS= -g -Wall -O2 #-m64 #-arch ppc
-DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE
-LOBJS= bcf.o vcf.o bcfutils.o prob1.o em.o kfunc.o kmin.o index.o fet.o mut.o bcf2qcall.o
-OMISC= ..
-AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o $(OMISC)/bedidx.o
-PROG= bcftools
-INCLUDES=
-SUBDIRS= .
-
-.SUFFIXES:.c .o
-
-.c.o:
- $(CC) -c $(CFLAGS) $(DFLAGS) -I.. $(INCLUDES) $< -o $@
-
-all-recur lib-recur clean-recur cleanlocal-recur install-recur:
- @target=`echo $@ | sed s/-recur//`; \
- wdir=`pwd`; \
- list='$(SUBDIRS)'; for subdir in $$list; do \
- cd $$subdir; \
- $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
- INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \
- cd $$wdir; \
- done;
-
-all:$(PROG)
-
-lib:libbcf.a
-
-libbcf.a:$(LOBJS)
- $(AR) -csru $@ $(LOBJS)
-
-bcftools:lib $(AOBJS)
- $(CC) $(CFLAGS) -o $@ $(AOBJS) -L. $(LIBPATH) -lbcf -lm -lz -lpthread
-
-bcf.o:bcf.h
-vcf.o:bcf.h
-index.o:bcf.h
-bcfutils.o:bcf.h
-prob1.o:prob1.h bcf.h
-call1.o:prob1.h bcf.h
-bcf2qcall.o:bcf.h
-main.o:bcf.h
-
-bcf.pdf:bcf.tex
- pdflatex bcf
-
-cleanlocal:
- rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a bcf.aux bcf.log bcf.pdf *.class libbcf.*.dylib libbcf.so*
-
-clean:cleanlocal-recur
diff --git a/sam/bcftools/README b/sam/bcftools/README
deleted file mode 100644
index 1d7159d..0000000
--- a/sam/bcftools/README
+++ /dev/null
@@ -1,36 +0,0 @@
-The view command of bcftools calls variants, tests Hardy-Weinberg
-equilibrium (HWE), tests allele balances and estimates allele frequency.
-
-This command calls a site as a potential variant if P(ref|D,F) is below
-0.9 (controlled by the -p option), where D is data and F is the prior
-allele frequency spectrum (AFS).
-
-The view command performs two types of allele balance tests, both based
-on Fisher's exact test for 2x2 contingency tables with the row variable
-being reference allele or not. In the first table, the column variable
-is strand. Two-tail P-value is taken. We test if variant bases tend to
-come from one strand. In the second table, the column variable is
-whether a base appears in the first or the last 11bp of the read.
-One-tail P-value is taken. We test if variant bases tend to occur
-towards the end of reads, which is usually an indication of
-misalignment.
-
-Site allele frequency is estimated in two ways. In the first way, the
-frequency is esimated as \argmax_f P(D|f) under the assumption of
-HWE. Prior AFS is not used. In the second way, the frequency is
-estimated as the posterior expectation of allele counts \sum_k
-kP(k|D,F), dividied by the total number of haplotypes. HWE is not
-assumed, but the estimate depends on the prior AFS. The two estimates
-largely agree when the signal is strong, but may differ greatly on weak
-sites as in this case, the prior plays an important role.
-
-To test HWE, we calculate the posterior distribution of genotypes
-(ref-hom, het and alt-hom). Chi-square test is performed. It is worth
-noting that the model used here is prior dependent and assumes HWE,
-which is different from both models for allele frequency estimate. The
-new model actually yields a third estimate of site allele frequency.
-
-The estimate allele frequency spectrum is printed to stderr per 64k
-sites. The estimate is in fact only the first round of a EM
-procedure. The second model (not the model for HWE testing) is used to
-estimate the AFS.
\ No newline at end of file
diff --git a/sam/bcftools/bcf.c b/sam/bcftools/bcf.c
deleted file mode 100644
index 24728db..0000000
--- a/sam/bcftools/bcf.c
+++ /dev/null
@@ -1,396 +0,0 @@
-#include <string.h>
-#include <ctype.h>
-#include <stdio.h>
-#include "kstring.h"
-#include "bcf.h"
-
-bcf_t *bcf_open(const char *fn, const char *mode)
-{
- bcf_t *b;
- b = calloc(1, sizeof(bcf_t));
- if (strchr(mode, 'w')) {
- b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdout), mode);
- } else {
- b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdin), mode);
- }
- return b;
-}
-
-int bcf_close(bcf_t *b)
-{
- int ret;
- if (b == 0) return 0;
- ret = bgzf_close(b->fp);
- free(b);
- return ret;
-}
-
-int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h)
-{
- if (b == 0 || h == 0) return -1;
- bgzf_write(b->fp, "BCF\4", 4);
- bgzf_write(b->fp, &h->l_nm, 4);
- bgzf_write(b->fp, h->name, h->l_nm);
- bgzf_write(b->fp, &h->l_smpl, 4);
- bgzf_write(b->fp, h->sname, h->l_smpl);
- bgzf_write(b->fp, &h->l_txt, 4);
- bgzf_write(b->fp, h->txt, h->l_txt);
- bgzf_flush(b->fp);
- return 16 + h->l_nm + h->l_smpl + h->l_txt;
-}
-
-bcf_hdr_t *bcf_hdr_read(bcf_t *b)
-{
- uint8_t magic[4];
- bcf_hdr_t *h;
- if (b == 0) return 0;
- h = calloc(1, sizeof(bcf_hdr_t));
- bgzf_read(b->fp, magic, 4);
- bgzf_read(b->fp, &h->l_nm, 4);
- h->name = malloc(h->l_nm);
- bgzf_read(b->fp, h->name, h->l_nm);
- bgzf_read(b->fp, &h->l_smpl, 4);
- h->sname = malloc(h->l_smpl);
- bgzf_read(b->fp, h->sname, h->l_smpl);
- bgzf_read(b->fp, &h->l_txt, 4);
- h->txt = malloc(h->l_txt);
- bgzf_read(b->fp, h->txt, h->l_txt);
- bcf_hdr_sync(h);
- return h;
-}
-
-void bcf_hdr_destroy(bcf_hdr_t *h)
-{
- if (h == 0) return;
- free(h->name); free(h->sname); free(h->txt); free(h->ns); free(h->sns);
- free(h);
-}
-
-static inline char **cnt_null(int l, char *str, int *_n)
-{
- int n = 0;
- char *p, **list;
- *_n = 0;
- if (l == 0 || str == 0) return 0;
- for (p = str; p != str + l; ++p)
- if (*p == 0) ++n;
- *_n = n;
- list = calloc(n, sizeof(void*));
- list[0] = str;
- for (p = str, n = 1; p < str + l - 1; ++p)
- if (*p == 0) list[n++] = p + 1;
- return list;
-}
-
-int bcf_hdr_sync(bcf_hdr_t *b)
-{
- if (b == 0) return -1;
- if (b->ns) free(b->ns);
- if (b->sns) free(b->sns);
- if (b->l_nm) b->ns = cnt_null(b->l_nm, b->name, &b->n_ref);
- else b->ns = 0, b->n_ref = 0;
- b->sns = cnt_null(b->l_smpl, b->sname, &b->n_smpl);
- return 0;
-}
-
-int bcf_sync(bcf1_t *b)
-{
- char *p, *tmp[5];
- int i, n, n_smpl = b->n_smpl;
- ks_tokaux_t aux;
- // set ref, alt, flt, info, fmt
- b->ref = b->alt = b->flt = b->info = b->fmt = 0;
- for (p = b->str, n = 0; p < b->str + b->l_str; ++p) {
- if (*p == 0 && p+1 != b->str + b->l_str) {
- if (n == 5) {
- ++n;
- break;
- } else tmp[n++] = p + 1;
- }
- }
- if (n != 5) {
- fprintf(stderr, "[%s] incorrect number of fields (%d != 5) at %d:%d\n", __func__, n, b->tid, b->pos);
- return -1;
- }
- b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4];
- // set n_alleles
- if (*b->alt == 0) b->n_alleles = 1;
- else {
- for (p = b->alt, n = 1; *p; ++p)
- if (*p == ',') ++n;
- b->n_alleles = n + 1;
- }
- // set n_gi and gi[i].fmt
- for (p = b->fmt, n = 1; *p; ++p)
- if (*p == ':') ++n;
- if (n > b->m_gi) {
- int old_m = b->m_gi;
- b->m_gi = n;
- kroundup32(b->m_gi);
- b->gi = realloc(b->gi, b->m_gi * sizeof(bcf_ginfo_t));
- memset(b->gi + old_m, 0, (b->m_gi - old_m) * sizeof(bcf_ginfo_t));
- }
- b->n_gi = n;
- for (p = kstrtok(b->fmt, ":", &aux), n = 0; p; p = kstrtok(0, 0, &aux))
- b->gi[n++].fmt = bcf_str2int(p, aux.p - p);
- // set gi[i].len
- for (i = 0; i < b->n_gi; ++i) {
- if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
- b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2;
- } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("HQ", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) {
- b->gi[i].len = 2;
- } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("GT", 2)) {
- b->gi[i].len = 1;
- } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
- b->gi[i].len = 4;
- } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
- b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2 * 4;
- }
- b->gi[i].data = realloc(b->gi[i].data, n_smpl * b->gi[i].len);
- }
- return 0;
-}
-
-int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b)
-{
- int i, l = 0;
- if (b == 0) return -1;
- bgzf_write(bp->fp, &b->tid, 4);
- bgzf_write(bp->fp, &b->pos, 4);
- bgzf_write(bp->fp, &b->qual, 4);
- bgzf_write(bp->fp, &b->l_str, 4);
- bgzf_write(bp->fp, b->str, b->l_str);
- l = 12 + b->l_str;
- for (i = 0; i < b->n_gi; ++i) {
- bgzf_write(bp->fp, b->gi[i].data, b->gi[i].len * h->n_smpl);
- l += b->gi[i].len * h->n_smpl;
- }
- return l;
-}
-
-int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b)
-{
- int i, l = 0;
- if (b == 0) return -1;
- if (bgzf_read(bp->fp, &b->tid, 4) == 0) return -1;
- b->n_smpl = h->n_smpl;
- bgzf_read(bp->fp, &b->pos, 4);
- bgzf_read(bp->fp, &b->qual, 4);
- bgzf_read(bp->fp, &b->l_str, 4);
- if (b->l_str > b->m_str) {
- b->m_str = b->l_str;
- kroundup32(b->m_str);
- b->str = realloc(b->str, b->m_str);
- }
- bgzf_read(bp->fp, b->str, b->l_str);
- l = 12 + b->l_str;
- if (bcf_sync(b) < 0) return -2;
- for (i = 0; i < b->n_gi; ++i) {
- bgzf_read(bp->fp, b->gi[i].data, b->gi[i].len * h->n_smpl);
- l += b->gi[i].len * h->n_smpl;
- }
- return l;
-}
-
-int bcf_destroy(bcf1_t *b)
-{
- int i;
- if (b == 0) return -1;
- free(b->str);
- for (i = 0; i < b->m_gi; ++i)
- free(b->gi[i].data);
- free(b->gi);
- free(b);
- return 0;
-}
-
-static inline void fmt_str(const char *p, kstring_t *s)
-{
- if (*p == 0) kputc('.', s);
- else kputs(p, s);
-}
-
-void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s)
-{
- int i, j, x;
- s->l = 0;
- if (h->n_ref) kputs(h->ns[b->tid], s);
- else kputw(b->tid, s);
- kputc('\t', s);
- kputw(b->pos + 1, s); kputc('\t', s);
- fmt_str(b->str, s); kputc('\t', s);
- fmt_str(b->ref, s); kputc('\t', s);
- fmt_str(b->alt, s); kputc('\t', s);
- ksprintf(s, "%.3g", b->qual); kputc('\t', s);
- fmt_str(b->flt, s); kputc('\t', s);
- fmt_str(b->info, s);
- if (b->fmt[0]) {
- kputc('\t', s);
- fmt_str(b->fmt, s);
- }
- x = b->n_alleles * (b->n_alleles + 1) / 2;
- if (b->n_gi == 0) return;
- int iPL = -1;
- if ( b->n_alleles > 2 ) {
- for (i=0; i<b->n_gi; i++) {
- if ( b->gi[i].fmt == bcf_str2int("PL", 2) ) {
- iPL = i;
- break;
- }
- }
- }
- for (j = 0; j < h->n_smpl; ++j) {
- int ploidy = b->ploidy ? b->ploidy[j] : 2;
- kputc('\t', s);
- for (i = 0; i < b->n_gi; ++i) {
- if (i) kputc(':', s);
- if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
- uint8_t *d = (uint8_t*)b->gi[i].data + j * x;
- int k;
- if ( ploidy==1 )
- for (k=0; k<b->n_alleles; k++)
- {
- if (k>0) kputc(',', s);
- kputw(d[(k+1)*(k+2)/2-1], s);
- }
- else
- for (k = 0; k < x; ++k) {
- if (k > 0) kputc(',', s);
- kputw(d[k], s);
- }
- } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) {
- kputw(((uint16_t*)b->gi[i].data)[j], s);
- } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
- kputw(((uint8_t*)b->gi[i].data)[j], s);
- } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
- kputw(((int32_t*)b->gi[i].data)[j], s);
- } else if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
- int y = ((uint8_t*)b->gi[i].data)[j];
- if ( ploidy==1 )
- {
- if ( y>>7&1 )
- kputc('.', s);
- else
- kputc('0' + (y>>3&7), s);
- }
- else
- {
- if ( y>>7&1 )
- kputsn("./.", 3, s);
- else {
- kputc('0' + (y>>3&7), s);
- kputc("/|"[y>>6&1], s);
- kputc('0' + (y&7), s);
- }
- }
- } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
- float *d = (float*)b->gi[i].data + j * x;
- int k;
- //printf("- %lx\n", d);
- for (k = 0; k < x; ++k) {
- if (k > 0) kputc(',', s);
- ksprintf(s, "%.2f", d[k]);
- }
- } else kputc('.', s); // custom fields
- }
- }
-}
-
-char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b)
-{
- kstring_t s;
- s.l = s.m = 0; s.s = 0;
- bcf_fmt_core(h, b, &s);
- return s.s;
-}
-
-int bcf_append_info(bcf1_t *b, const char *info, int l)
-{
- int shift = b->fmt - b->str;
- int l_fmt = b->l_str - shift;
- char *ori = b->str;
- if (b->l_str + l > b->m_str) { // enlarge if necessary
- b->m_str = b->l_str + l;
- kroundup32(b->m_str);
- b->str = realloc(b->str, b->m_str);
- }
- memmove(b->str + shift + l, b->str + shift, l_fmt); // move the FORMAT field
- memcpy(b->str + shift - 1, info, l); // append to the INFO field
- b->str[shift + l - 1] = '\0';
- b->fmt = b->str + shift + l;
- b->l_str += l;
- if (ori != b->str) bcf_sync(b); // synchronize when realloc changes the pointer
- return 0;
-}
-
-int remove_tag(char *str, const char *tag, char delim)
-{
- char *tmp = str, *p;
- int len_diff = 0, ori_len = strlen(str);
- while ( *tmp && (p = strstr(tmp,tag)) )
- {
- if ( p>str )
- {
- if ( *(p-1)!=delim ) { tmp=p+1; continue; } // shared substring
- p--;
- }
- char *q=p+1;
- while ( *q && *q!=delim ) q++;
- if ( p==str && *q ) q++; // the tag is first, don't move the delim char
- len_diff += q-p;
- if ( ! *q ) { *p = 0; break; } // the tag was last, no delim follows
- else
- memmove(p,q,ori_len-(int)(p-str)-(int)(q-p)); // *q==delim
- }
- if ( len_diff==ori_len )
- str[0]='.', str[1]=0, len_diff--;
-
- return len_diff;
-}
-
-
-void rm_info(kstring_t *s, const char *key)
-{
- char *p = s->s;
- int n = 0;
- while ( n<4 )
- {
- if ( !*p ) n++;
- p++;
- }
- char *q = p+1;
- while ( *q && q-s->s<s->l ) q++;
-
- int nrm = remove_tag(p, key, ';');
- if ( nrm )
- memmove(q-nrm, q, s->s+s->l-q+1);
- s->l -= nrm;
-}
-
-int bcf_cpy(bcf1_t *r, const bcf1_t *b)
-{
- char *t1 = r->str;
- bcf_ginfo_t *t2 = r->gi;
- int i, t3 = r->m_str, t4 = r->m_gi;
- *r = *b;
- r->str = t1; r->gi = t2; r->m_str = t3; r->m_gi = t4;
- if (r->m_str < b->m_str) {
- r->m_str = b->m_str;
- r->str = realloc(r->str, r->m_str);
- }
- memcpy(r->str, b->str, r->m_str);
- bcf_sync(r); // calling bcf_sync() is simple but inefficient
- for (i = 0; i < r->n_gi; ++i)
- memcpy(r->gi[i].data, b->gi[i].data, r->n_smpl * r->gi[i].len);
- return 0;
-}
-
-int bcf_is_indel(const bcf1_t *b)
-{
- char *p;
- if (strlen(b->ref) > 1) return 1;
- for (p = b->alt; *p; ++p)
- if (*p != ',' && p[1] != ',' && p[1] != '\0')
- return 1;
- return 0;
-}
diff --git a/sam/bcftools/bcf.h b/sam/bcftools/bcf.h
deleted file mode 100644
index f722525..0000000
--- a/sam/bcftools/bcf.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2010 Broad Institute
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/* Contact: Heng Li <***@live.co.uk> */
-
-#ifndef BCF_H
-#define BCF_H
-
-#define BCF_VERSION "0.1.19-44428cd"
-
-#include <stdint.h>
-#include <zlib.h>
-
-#ifndef BCF_LITE
-#include "bgzf.h"
-typedef BGZF *bcfFile;
-#else
-typedef gzFile bcfFile;
-#define bgzf_open(fn, mode) gzopen(fn, mode)
-#define bgzf_fdopen(fd, mode) gzdopen(fd, mode)
-#define bgzf_close(fp) gzclose(fp)
-#define bgzf_read(fp, buf, len) gzread(fp, buf, len)
-#define bgzf_write(fp, buf, len)
-#define bgzf_flush(fp)
-#endif
-
-/*
- A member in the structs below is said to "primary" if its content
- cannot be inferred from other members in any of structs below; a
- member is said to be "derived" if its content can be derived from
- other members. For example, bcf1_t::str is primary as this comes from
- the input data, while bcf1_t::info is derived as it can always be
- correctly set if we know bcf1_t::str. Derived members are for quick
- access to the content and must be synchronized with the primary data.
- */
-
-typedef struct {
- uint32_t fmt; // format of the block, set by bcf_str2int().
- int len; // length of data for each individual
- void *data; // concatenated data
- // derived info: fmt, len (<-bcf1_t::fmt)
-} bcf_ginfo_t;
-
-typedef struct {
- int32_t tid, pos; // refID and 0-based position
- int32_t l_str, m_str; // length and the allocated size of ->str
- float qual; // SNP quality
- char *str; // concatenated string of variable length strings in VCF (from col.2 to col.7)
- char *ref, *alt, *flt, *info, *fmt; // they all point to ->str; no memory allocation
- int n_gi, m_gi; // number and the allocated size of geno fields
- bcf_ginfo_t *gi; // array of geno fields
- int n_alleles, n_smpl; // number of alleles and samples
- // derived info: ref, alt, flt, info, fmt (<-str), n_gi (<-fmt), n_alleles (<-alt), n_smpl (<-bcf_hdr_t::n_smpl)
- uint8_t *ploidy; // ploidy of all samples; if NULL, ploidy of 2 is assumed.
-} bcf1_t;
-
-typedef struct {
- int32_t n_ref, n_smpl; // number of reference sequences and samples
- int32_t l_nm; // length of concatenated sequence names; 0 padded
- int32_t l_smpl; // length of concatenated sample names; 0 padded
- int32_t l_txt; // length of header text (lines started with ##)
- char *name, *sname, *txt; // concatenated sequence names, sample names and header text
- char **ns, **sns; // array of sequence and sample names; point to name and sname, respectively
- // derived info: n_ref (<-name), n_smpl (<-sname), ns (<-name), sns (<-sname)
-} bcf_hdr_t;
-
-typedef struct {
- int is_vcf; // if the file in operation is a VCF
- void *v; // auxillary data structure for VCF
- bcfFile fp; // file handler for BCF
-} bcf_t;
-
-struct __bcf_idx_t;
-typedef struct __bcf_idx_t bcf_idx_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- // open a BCF file; for BCF file only
- bcf_t *bcf_open(const char *fn, const char *mode);
- // close file
- int bcf_close(bcf_t *b);
- // read one record from BCF; return -1 on end-of-file, and <-1 for errors
- int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b);
- // call this function if b->str is changed
- int bcf_sync(bcf1_t *b);
- // write a BCF record
- int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b);
- // read the BCF header; BCF only
- bcf_hdr_t *bcf_hdr_read(bcf_t *b);
- // write the BCF header
- int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h);
- // set bcf_hdr_t::ns and bcf_hdr_t::sns
- int bcf_hdr_sync(bcf_hdr_t *b);
- // destroy the header
- void bcf_hdr_destroy(bcf_hdr_t *h);
- // destroy a record
- int bcf_destroy(bcf1_t *b);
- // BCF->VCF conversion
- char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b);
- // append more info
- int bcf_append_info(bcf1_t *b, const char *info, int l);
- // remove tag
- int remove_tag(char *string, const char *tag, char delim);
- // remove info tag, string is the kstring holder of bcf1_t.str
- void rm_info(kstring_t *string, const char *key);
- // copy
- int bcf_cpy(bcf1_t *r, const bcf1_t *b);
-
- // open a VCF or BCF file if "b" is set in "mode"
- bcf_t *vcf_open(const char *fn, const char *mode);
- // close a VCF/BCF file
- int vcf_close(bcf_t *bp);
- // read the VCF/BCF header
- bcf_hdr_t *vcf_hdr_read(bcf_t *bp);
- // read the sequence dictionary from a separate file; required for VCF->BCF conversion
- int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn);
- // read a VCF/BCF record; return -1 on end-of-file and <-1 for errors
- int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b);
- // write the VCF header
- int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h);
- // write a VCF record
- int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b);
-
- // keep the first n alleles and discard the rest
- int bcf_shrink_alt(bcf1_t *b, int n);
- // keep the masked alleles and discard the rest
- void bcf_fit_alt(bcf1_t *b, int mask);
- // convert GL to PL
- int bcf_gl2pl(bcf1_t *b);
- // if the site is an indel
- int bcf_is_indel(const bcf1_t *b);
- bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list);
- int bcf_subsam(int n_smpl, int *list, bcf1_t *b);
- // move GT to the first FORMAT field
- int bcf_fix_gt(bcf1_t *b);
- // update PL generated by old samtools
- int bcf_fix_pl(bcf1_t *b);
- // convert PL to GLF-like 10-likelihood GL
- int bcf_gl10(const bcf1_t *b, uint8_t *gl);
- // convert up to 4 INDEL alleles to GLF-like 10-likelihood GL
- int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl);
-
- // string hash table
- void *bcf_build_refhash(bcf_hdr_t *h);
- void bcf_str2id_destroy(void *_hash);
- void bcf_str2id_thorough_destroy(void *_hash);
- int bcf_str2id_add(void *_hash, const char *str);
- int bcf_str2id(void *_hash, const char *str);
- void *bcf_str2id_init();
-
- // indexing related functions
- int bcf_idx_build(const char *fn);
- uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg);
- int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end);
- bcf_idx_t *bcf_idx_load(const char *fn);
- void bcf_idx_destroy(bcf_idx_t *idx);
-
-#ifdef __cplusplus
-}
-#endif
-
-static inline uint32_t bcf_str2int(const char *str, int l)
-{
- int i;
- uint32_t x = 0;
- for (i = 0; i < l && i < 4; ++i) {
- if (str[i] == 0) return x;
- x = x<<8 | str[i];
- }
- return x;
-}
-
-#endif
diff --git a/sam/bcftools/bcf.tex b/sam/bcftools/bcf.tex
deleted file mode 100644
index 442fc2a..0000000
--- a/sam/bcftools/bcf.tex
+++ /dev/null
@@ -1,77 +0,0 @@
-\documentclass[10pt,pdftex]{article}
-\usepackage{color}
-\definecolor{gray}{rgb}{0.7,0.7,0.7}
-
-\setlength{\topmargin}{0.0cm}
-\setlength{\textheight}{21.5cm}
-\setlength{\oddsidemargin}{0cm}
-\setlength{\textwidth}{16.5cm}
-\setlength{\columnsep}{0.6cm}
-
-\begin{document}
-
-\begin{center}
-\begin{tabular}{|l|l|l|l|l|}
-\hline
-\multicolumn{2}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Descrption} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\hline\hline
-\multicolumn{2}{|l|}{\sf magic} & Magic string & {\tt char[4]} & {\tt BCF\char92 4} \\\hline
-\multicolumn{2}{|l|}{\sf l\_seqnm} & Length of concatenated sequence names & {\tt int32\_t} & \\\hline
-\multicolumn{2}{|l|}{\sf seqnm} & Concatenated names, {\tt NULL} padded & {\tt char[{\sf l\_seqnm}]} & \\\hline
-\multicolumn{2}{|l|}{\sf l\_smpl} & Length of concatenated sample names & {\tt int32\_t} & \\\hline
-\multicolumn{2}{|l|}{\sf smpl} & Concatenated sample names & {\tt char[{\sf l\_smpl}]} & \\\hline
-\multicolumn{2}{|l|}{\sf l\_meta} & Length of the meta text (double-hash lines)& {\tt int32\_t} & \\\hline
-\multicolumn{2}{|l|}{\sf meta} & Meta text, {\tt NULL} terminated & {\tt char[{\sf l\_meta}]} & \\\hline
-\multicolumn{5}{|c|}{\it \color{gray}{List of records until the end of the file}}\\\cline{2-5}
-& {\sf seq\_id} & Reference sequence ID & {\tt int32\_t} & \\\cline{2-5}
-& {\sf pos} & Position & {\tt int32\_t} & \\\cline{2-5}
-& {\sf qual} & Variant quality & {\tt float} & \\\cline{2-5}
-& {\sf l\_str} & Length of {\sf str} & {\tt int32\_t} & \\\cline{2-5}
-& {\sf str} & {\tt ID+REF+ALT+FILTER+INFO+FORMAT}, {\tt NULL} padded & {\tt char[{\sf l\_str}]} &\\\cline{2-5}
-& \multicolumn{4}{c|}{Blocks of data; \#blocks and formats defined by {\tt FORMAT} (table below)}\\
-\hline
-\end{tabular}
-\end{center}
-
-\begin{center}
-\begin{tabular}{clp{9cm}}
-\hline
-\multicolumn{1}{l}{\bf Field} & \multicolumn{1}{l}{\bf Type} & \multicolumn{1}{l}{\bf Description} \\\hline
-{\tt DP} & {\tt uint16\_t[n]} & Read depth \\
-{\tt GL} & {\tt float[n*G]} & Log10 likelihood of data; $G=\frac{A(A+1)}{2}$, $A=\#\{alleles\}$\\
-{\tt GT} & {\tt uint8\_t[n]} & {\tt missing\char60\char60 7 | phased\char60\char60 6 | allele1\char60\char60 3 | allele2} \\
-{\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$. If the highest bit is set,
- the allele is not present (e.g. due to different ploidy between samples).} \\
-{\tt GQ} & {\tt uint8\_t[n]} & {Genotype quality}\\
-{\tt HQ} & {\tt uint8\_t[n*2]} & {Haplotype quality}\\
-{\tt \_HQ} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic HQ}\\
-{\tt IBD} & {\tt uint32\_t[n*2]} & {IBD}\\
-{\tt \_IBD} & {\tt uint8\_t+uint32\_t[n*P]} & {Generic IBD}\\
-{\tt PL} & {\tt uint8\_t[n*G]} & {Phred-scaled likelihood of data}\\
-{\tt PS} & {\tt uint32\_t[n]} & {Phase set}\\
-%{\tt SP} & {\tt uint8\_t[n]} & {Strand bias P-value (bcftools only)}\\
-\emph{Integer} & {\tt int32\_t[n*X]} & {Fix-sized custom Integer; $X$ defined in the header}\\
-\emph{Numeric} & {\tt double[n*X]} & {Fix-sized custom Numeric}\\
-\emph{String} & {\tt uint32\_t+char*} & {\tt NULL} padded concat. strings (int equals to the length) \\
-\hline
-\end{tabular}
-\end{center}
-
-\begin{itemize}
-\item A BCF file is in the {\tt BGZF} format.
-\item All multi-byte numbers are little-endian.
-\item In a string, a missing value `.' is an empty C string ``{\tt
- \char92 0}'' (not ``{\tt .\char92 0}'')
-\item For {\tt GL} and {\tt PL}, likelihoods of genotypes appear in the
- order of alleles in {\tt REF} and then {\tt ALT}. For example, if {\tt
- REF=C}, {\tt ALT=T,A}, likelihoods appear in the order of {\tt
- CC,CT,TT,CA,TA,AA} (NB: the ordering is different from the one in the original
- BCF proposal).
-\item Predefined {\tt FORMAT} fields can be missing from VCF headers, but custom {\tt FORMAT} fields
- are required to be explicitly defined in the headers.
-\item A {\tt FORMAT} field with its name starting with `{\tt \_}' is specific to BCF only.
- It gives an alternative binary representation of the corresponding VCF field, in case
- the default representation is unable to keep the genotype information,
- for example, when the ploidy is not 2 or there are more than 8 alleles.
-\end{itemize}
-
-\end{document}
diff --git a/sam/bcftools/bcf2qcall.c b/sam/bcftools/bcf2qcall.c
deleted file mode 100644
index a86bac2..0000000
--- a/sam/bcftools/bcf2qcall.c
+++ /dev/null
@@ -1,91 +0,0 @@
-#include <errno.h>
-#include <math.h>
-#include <string.h>
-#include <stdlib.h>
-#include "bcf.h"
-
-static int8_t nt4_table[256] = {
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4,
- 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
-};
-
-static int read_I16(bcf1_t *b, int anno[16])
-{
- char *p;
- int i;
- if ((p = strstr(b->info, "I16=")) == 0) return -1;
- p += 4;
- for (i = 0; i < 16; ++i) {
- anno[i] = strtol(p, &p, 10);
- if (anno[i] == 0 && (errno == EINVAL || errno == ERANGE)) return -2;
- ++p;
- }
- return 0;
-}
-
-int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b)
-{
- int a[4], k, g[10], l, map[4], k1, j, i, i0, anno[16], dp, mq, d_rest;
- char *s;
- if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base
- for (i = 0; i < b->n_gi; ++i)
- if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
- if (i == b->n_gi) return -1; // no PL
- if (read_I16(b, anno) != 0) return -1; // no I16; FIXME: can be improved
- d_rest = dp = anno[0] + anno[1] + anno[2] + anno[3];
- if (dp == 0) return -1; // depth is zero
- mq = (int)(sqrt((double)(anno[9] + anno[11]) / dp) + .499);
- i0 = i;
- a[0] = nt4_table[(int)b->ref[0]];
- if (a[0] > 3) return -1; // ref is not A/C/G/T
- a[1] = a[2] = a[3] = -2; // -1 has a special meaning
- if (b->alt[0] == 0) return -1; // no alternate allele
- map[0] = map[1] = map[2] = map[3] = -2;
- map[a[0]] = 0;
- for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) {
- if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base
- a[k+1] = nt4_table[(int)*s];
- if (a[k+1] >= 0) map[a[k+1]] = k+1;
- else k1 = k+1;
- if (s[1] == 0) break;
- }
- for (k = 0; k < 4; ++k)
- if (map[k] < 0) map[k] = k1;
- for (i = 0; i < h->n_smpl; ++i) {
- int d;
- uint8_t *p = b->gi[i0].data + i * b->gi[i0].len;
- for (j = 0; j < b->gi[i0].len; ++j)
- if (p[j]) break;
- d = (int)((double)d_rest / (h->n_smpl - i) + .499);
- if (d == 0) d = 1;
- if (j == b->gi[i0].len) d = 0;
- d_rest -= d;
- for (k = j = 0; k < 4; ++k) {
- for (l = k; l < 4; ++l) {
- int t, x = map[k], y = map[l];
- if (x > y) t = x, x = y, y = t; // swap
- g[j++] = p[y * (y+1) / 2 + x];
- }
- }
- printf("%s\t%d\t%c", h->ns[b->tid], b->pos+1, *b->ref);
- printf("\t%d\t%d\t0", d, mq);
- for (j = 0; j < 10; ++j)
- printf("\t%d", g[j]);
- printf("\t%s\n", h->sns[i]);
- }
- return 0;
-}
diff --git a/sam/bcftools/bcfutils.c b/sam/bcftools/bcfutils.c
deleted file mode 100644
index 7638085..0000000
--- a/sam/bcftools/bcfutils.c
+++ /dev/null
@@ -1,504 +0,0 @@
-#include <string.h>
-#include <math.h>
-#include <assert.h>
-#include "bcf.h"
-#include "kstring.h"
-#include "khash.h"
-KHASH_MAP_INIT_STR(str2id, int)
-
-#ifdef _WIN32
-#define srand48(x) srand(x)
-#define drand48() ((double)rand() / RAND_MAX)
-#endif
-
-// FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated...
-void *bcf_build_refhash(bcf_hdr_t *h)
-{
- khash_t(str2id) *hash;
- int i, ret;
- hash = kh_init(str2id);
- for (i = 0; i < h->n_ref; ++i) {
- khint_t k;
- k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret
- kh_val(hash, k) = i;
- }
- return hash;
-}
-
-void *bcf_str2id_init()
-{
- return kh_init(str2id);
-}
-
-void bcf_str2id_destroy(void *_hash)
-{
- khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
- if (hash) kh_destroy(str2id, hash); // Note that strings are not freed.
-}
-
-void bcf_str2id_thorough_destroy(void *_hash)
-{
- khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
- khint_t k;
- if (hash == 0) return;
- for (k = 0; k < kh_end(hash); ++k)
- if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
- kh_destroy(str2id, hash);
-}
-
-int bcf_str2id(void *_hash, const char *str)
-{
- khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
- khint_t k;
- if (!hash) return -1;
- k = kh_get(str2id, hash, str);
- return k == kh_end(hash)? -1 : kh_val(hash, k);
-}
-
-int bcf_str2id_add(void *_hash, const char *str)
-{
- khint_t k;
- int ret;
- khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
- if (!hash) return -1;
- k = kh_put(str2id, hash, str, &ret);
- if (ret == 0) return kh_val(hash, k);
- kh_val(hash, k) = kh_size(hash) - 1;
- return kh_val(hash, k);
-}
-
-void bcf_fit_alt(bcf1_t *b, int mask)
-{
- mask |= 1; // REF must be always present
-
- int i,j,nals=0;
- for (i=0; i<sizeof(int); i++)
- if ( mask&1<<i) nals++;
- if ( b->n_alleles <= nals ) return;
-
- // update ALT, in principle any of the alleles can be removed
- char *p;
- if ( nals>1 )
- {
- char *dst, *src;
- int n=0, nalts=nals-1;
- for (src=dst=p=b->alt, i=1; *p; p++)
- {
- if ( *p!=',' ) continue;
-
- if ( mask&1<<i )
- {
- n++;
- if ( src!=dst )
- {
- memmove(dst,src,p-src);
- dst += p-src;
- }
- else dst = p;
- if ( n<nalts ) { *dst=','; dst++; }
- }
- i++;
-
- if ( n>=nalts ) { *dst=0; break; }
- src = p+1;
- }
- if ( n<nalts )
- {
- memmove(dst,src,p-src);
- dst += p-src;
- *dst = 0;
- }
- p = dst;
- }
- else p = b->alt, *p = '\0';
- p++;
- memmove(p, b->flt, b->str + b->l_str - b->flt);
- b->l_str -= b->flt - p;
-
- // update PL and GT
- int ipl=-1, igt=-1;
- for (i = 0; i < b->n_gi; ++i)
- {
- bcf_ginfo_t *g = b->gi + i;
- if (g->fmt == bcf_str2int("PL", 2)) ipl = i;
- if (g->fmt == bcf_str2int("GT", 2)) igt = i;
- }
-
- // .. create mapping between old and new indexes
- int npl = nals * (nals+1) / 2;
- int *map = malloc(sizeof(int)*(npl>b->n_alleles ? npl : b->n_alleles));
- int kori=0,knew=0;
- for (i=0; i<b->n_alleles; i++)
- {
- for (j=0; j<=i; j++)
- {
- int skip=0;
- if ( i && !(mask&1<<i) ) skip=1;
- if ( j && !(mask&1<<j) ) skip=1;
- if ( !skip ) { map[knew++] = kori; }
- kori++;
- }
- }
- // .. apply to all samples
- int n_smpl = b->n_smpl;
- for (i = 0; i < b->n_gi; ++i)
- {
- bcf_ginfo_t *g = b->gi + i;
- if (g->fmt == bcf_str2int("PL", 2))
- {
- g->len = npl;
- uint8_t *d = (uint8_t*)g->data;
- int ismpl, npl_ori = b->n_alleles * (b->n_alleles + 1) / 2;
- for (knew=ismpl=0; ismpl<n_smpl; ismpl++)
- {
- uint8_t *dl = d + ismpl * npl_ori;
- for (j=0; j<npl; j++) d[knew++] = dl[map[j]];
- }
- } // FIXME: to add GL
- }
- // update GTs
- map[0] = 0;
- for (i=1, knew=0; i<b->n_alleles; i++)
- map[i] = mask&1<<i ? ++knew : -1;
- for (i=0; i<n_smpl; i++)
- {
- uint8_t gt = ((uint8_t*)b->gi[igt].data)[i];
- int a1 = (gt>>3)&7;
- int a2 = gt&7;
- assert( map[a1]>=0 && map[a2]>=0 );
- ((uint8_t*)b->gi[igt].data)[i] = ((1<<7|1<<6)&gt) | map[a1]<<3 | map[a2];
- }
- free(map);
- b->n_alleles = nals;
- bcf_sync(b);
-}
-
-int bcf_shrink_alt(bcf1_t *b, int n)
-{
- char *p;
- int i, j, k, n_smpl = b->n_smpl;
- if (b->n_alleles <= n) return -1;
- // update ALT
- if (n > 1) {
- for (p = b->alt, k = 1; *p; ++p)
- if (*p == ',' && ++k == n) break;
- *p = '\0';
- } else p = b->alt, *p = '\0';
- ++p;
- memmove(p, b->flt, b->str + b->l_str - b->flt);
- b->l_str -= b->flt - p;
- // update PL
- for (i = 0; i < b->n_gi; ++i) {
- bcf_ginfo_t *g = b->gi + i;
- if (g->fmt == bcf_str2int("PL", 2)) {
- int l, x = b->n_alleles * (b->n_alleles + 1) / 2;
- uint8_t *d = (uint8_t*)g->data;
- g->len = n * (n + 1) / 2;
- for (l = k = 0; l < n_smpl; ++l) {
- uint8_t *dl = d + l * x;
- for (j = 0; j < g->len; ++j) d[k++] = dl[j];
- }
- } // FIXME: to add GL
- }
- b->n_alleles = n;
- bcf_sync(b);
- return 0;
-}
-
-int bcf_gl2pl(bcf1_t *b)
-{
- char *p;
- int i, n_smpl = b->n_smpl;
- bcf_ginfo_t *g;
- float *d0;
- uint8_t *d1;
- if (strstr(b->fmt, "PL")) return -1;
- if ((p = strstr(b->fmt, "GL")) == 0) return -1;
- *p = 'P';
- for (i = 0; i < b->n_gi; ++i)
- if (b->gi[i].fmt == bcf_str2int("GL", 2))
- break;
- g = b->gi + i;
- g->fmt = bcf_str2int("PL", 2);
- g->len /= 4; // 4 == sizeof(float)
- d0 = (float*)g->data; d1 = (uint8_t*)g->data;
- for (i = 0; i < n_smpl * g->len; ++i) {
- int x = (int)(-10. * d0[i] + .499);
- if (x > 255) x = 255;
- if (x < 0) x = 0;
- d1[i] = x;
- }
- return 0;
-}
-/* FIXME: this function will fail given AB:GTX:GT. BCFtools never
- * produces such FMT, but others may do. */
-int bcf_fix_gt(bcf1_t *b)
-{
- char *s;
- int i;
- uint32_t tmp;
- bcf_ginfo_t gt;
- // check the presence of the GT FMT
- if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first
- assert(s[3] == '\0' || s[3] == ':'); // :GTX in fact
- tmp = bcf_str2int("GT", 2);
- for (i = 0; i < b->n_gi; ++i)
- if (b->gi[i].fmt == tmp) break;
- if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug...
- gt = b->gi[i];
- // move GT to the first
- for (; i > 0; --i) b->gi[i] = b->gi[i-1];
- b->gi[0] = gt;
- if ( s[3]==0 )
- memmove(b->fmt + 3, b->fmt, s - b->fmt); // :GT
- else
- memmove(b->fmt + 3, b->fmt, s - b->fmt + 1); // :GT:
- b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':';
- return 0;
-}
-
-int bcf_fix_pl(bcf1_t *b)
-{
- int i;
- uint32_t tmp;
- uint8_t *PL, *swap;
- bcf_ginfo_t *gi;
- // pinpoint PL
- tmp = bcf_str2int("PL", 2);
- for (i = 0; i < b->n_gi; ++i)
- if (b->gi[i].fmt == tmp) break;
- if (i == b->n_gi) return 0;
- // prepare
- gi = b->gi + i;
- PL = (uint8_t*)gi->data;
- swap = alloca(gi->len);
- // loop through individuals
- for (i = 0; i < b->n_smpl; ++i) {
- int k, l, x;
- uint8_t *PLi = PL + i * gi->len;
- memcpy(swap, PLi, gi->len);
- for (k = x = 0; k < b->n_alleles; ++k)
- for (l = k; l < b->n_alleles; ++l)
- PLi[l*(l+1)/2 + k] = swap[x++];
- }
- return 0;
-}
-
-int bcf_smpl_covered(const bcf1_t *b)
-{
- int i, j, n = 0;
- uint32_t tmp;
- bcf_ginfo_t *gi;
- // pinpoint PL
- tmp = bcf_str2int("PL", 2);
- for (i = 0; i < b->n_gi; ++i)
- if (b->gi[i].fmt == tmp) break;
- if (i == b->n_gi) return 0;
- // count how many samples having PL!=[0..0]
- gi = b->gi + i;
- for (i = 0; i < b->n_smpl; ++i) {
- uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len;
- for (j = 0; j < gi->len; ++j)
- if (PLi[j]) break;
- if (j < gi->len) ++n;
- }
- return n;
-}
-
-static void *locate_field(const bcf1_t *b, const char *fmt, int l)
-{
- int i;
- uint32_t tmp;
- tmp = bcf_str2int(fmt, l);
- for (i = 0; i < b->n_gi; ++i)
- if (b->gi[i].fmt == tmp) break;
- return i == b->n_gi? 0 : b->gi[i].data;
-}
-
-int bcf_anno_max(bcf1_t *b)
-{
- int k, max_gq, max_sp, n_het;
- kstring_t str;
- uint8_t *gt, *gq;
- int32_t *sp;
- max_gq = max_sp = n_het = 0;
- gt = locate_field(b, "GT", 2);
- if (gt == 0) return -1;
- gq = locate_field(b, "GQ", 2);
- sp = locate_field(b, "SP", 2);
- if (sp)
- for (k = 0; k < b->n_smpl; ++k)
- if (gt[k]&0x3f)
- max_sp = max_sp > (int)sp[k]? max_sp : sp[k];
- if (gq)
- for (k = 0; k < b->n_smpl; ++k)
- if (gt[k]&0x3f)
- max_gq = max_gq > (int)gq[k]? max_gq : gq[k];
- for (k = 0; k < b->n_smpl; ++k) {
- int a1, a2;
- a1 = gt[k]&7; a2 = gt[k]>>3&7;
- if ((!a1 && a2) || (!a2 && a1)) { // a het
- if (gq == 0) ++n_het;
- else if (gq[k] >= 20) ++n_het;
- }
- }
- if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499);
- if (max_sp < 0) max_sp = 0;
- memset(&str, 0, sizeof(kstring_t));
- if (*b->info) kputc(';', &str);
- ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq);
- bcf_append_info(b, str.s, str.l);
- free(str.s);
- return 0;
-}
-
-// FIXME: only data are shuffled; the header is NOT
-int bcf_shuffle(bcf1_t *b, int seed)
-{
- int i, j, *a;
- if (seed > 0) srand48(seed);
- a = malloc(b->n_smpl * sizeof(int));
- for (i = 0; i < b->n_smpl; ++i) a[i] = i;
- for (i = b->n_smpl; i > 1; --i) {
- int tmp;
- j = (int)(drand48() * i);
- tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;
- }
- for (j = 0; j < b->n_gi; ++j) {
- bcf_ginfo_t *gi = b->gi + j;
- uint8_t *swap, *data = (uint8_t*)gi->data;
- swap = malloc(gi->len * b->n_smpl);
- for (i = 0; i < b->n_smpl; ++i)
- memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len);
- free(gi->data);
- gi->data = swap;
- }
- free(a);
- return 0;
-}
-
-bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list)
-{
- int i, ret, j;
- khint_t k;
- bcf_hdr_t *h;
- khash_t(str2id) *hash;
- kstring_t s;
- s.l = s.m = 0; s.s = 0;
- hash = kh_init(str2id);
- for (i = 0; i < h0->n_smpl; ++i) {
- k = kh_put(str2id, hash, h0->sns[i], &ret);
- kh_val(hash, k) = i;
- }
- for (i = j = 0; i < n; ++i) {
- k = kh_get(str2id, hash, samples[i]);
- if (k != kh_end(hash)) {
- list[j++] = kh_val(hash, k);
- kputs(samples[i], &s); kputc('\0', &s);
- }
- }
- if (j < n)
- {
- fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j);
- exit(1);
- }
- kh_destroy(str2id, hash);
- h = calloc(1, sizeof(bcf_hdr_t));
- *h = *h0;
- h->ns = 0; h->sns = 0;
- h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm);
- h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt);
- h->l_smpl = s.l; h->sname = s.s;
- bcf_hdr_sync(h);
- return h;
-}
-
-int bcf_subsam(int n_smpl, int *list, bcf1_t *b)
-{
- int i, j;
- for (j = 0; j < b->n_gi; ++j) {
- bcf_ginfo_t *gi = b->gi + j;
- uint8_t *swap;
- swap = malloc(gi->len * b->n_smpl);
- for (i = 0; i < n_smpl; ++i)
- memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len);
- free(gi->data);
- gi->data = swap;
- }
- b->n_smpl = n_smpl;
- return 0;
-}
-
-static int8_t nt4_table[128] = {
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4,
- 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4
-};
-
-int bcf_gl10(const bcf1_t *b, uint8_t *gl)
-{
- int a[4], k, l, map[4], k1, j, i;
- const bcf_ginfo_t *PL;
- char *s;
- if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles
- for (i = 0; i < b->n_gi; ++i)
- if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
- if (i == b->n_gi) return -1; // no PL
- PL = b->gi + i;
- a[0] = nt4_table[(int)b->ref[0]];
- if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T
- a[1] = a[2] = a[3] = -2; // -1 has a special meaning
- if (b->alt[0] == 0) return -1; // no alternate allele
- map[0] = map[1] = map[2] = map[3] = -2;
- map[a[0]] = 0;
- for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) {
- if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base
- a[k+1] = nt4_table[(int)*s];
- if (a[k+1] >= 0) map[a[k+1]] = k+1;
- else k1 = k + 1;
- if (s[1] == 0) break; // the end of the ALT string
- }
- for (k = 0; k < 4; ++k)
- if (map[k] < 0) map[k] = k1;
- for (i = 0; i < b->n_smpl; ++i) {
- const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
- uint8_t *g = gl + 10 * i;
- for (k = j = 0; k < 4; ++k) {
- for (l = k; l < 4; ++l) {
- int t, x = map[k], y = map[l];
- if (x > y) t = x, x = y, y = t; // make sure x is the smaller
- g[j++] = p[y * (y+1) / 2 + x];
- }
- }
- }
- return 0;
-}
-
-int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl)
-{
- int k, l, j, i;
- const bcf_ginfo_t *PL;
- if (b->alt[0] == 0) return -1; // no alternate allele
- for (i = 0; i < b->n_gi; ++i)
- if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
- if (i == b->n_gi) return -1; // no PL
- PL = b->gi + i;
- for (i = 0; i < b->n_smpl; ++i) {
- const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
- uint8_t *g = gl + 10 * i;
- for (k = j = 0; k < 4; ++k) {
- for (l = k; l < 4; ++l) {
- int t, x = k, y = l;
- if (x > y) t = x, x = y, y = t; // make sure x is the smaller
- x = y * (y+1) / 2 + x;
- g[j++] = x < PL->len? p[x] : 255;
- }
- }
- }
- return 0;
-}
diff --git a/sam/bcftools/call1.c b/sam/bcftools/call1.c
deleted file mode 100644
index e6373d3..0000000
--- a/sam/bcftools/call1.c
+++ /dev/null
@@ -1,633 +0,0 @@
-#include <unistd.h>
-#include <stdlib.h>
-#include <math.h>
-#include <zlib.h>
-#include <errno.h>
-#include "bcf.h"
-#include "prob1.h"
-#include "kstring.h"
-#include "time.h"
-
-#ifdef _WIN32
-#define srand48(x) srand(x)
-#define lrand48() rand()
-#endif
-
-#include "kseq.h"
-KSTREAM_INIT(gzFile, gzread, 16384)
-
-#define VC_NO_GENO 2
-#define VC_BCFOUT 4
-#define VC_CALL 8
-#define VC_VARONLY 16
-#define VC_VCFIN 32
-#define VC_UNCOMP 64
-#define VC_KEEPALT 256
-#define VC_ACGT_ONLY 512
-#define VC_QCALL 1024
-#define VC_CALL_GT 2048
-#define VC_ADJLD 4096
-#define VC_NO_INDEL 8192
-#define VC_ANNO_MAX 16384
-#define VC_FIX_PL 32768
-#define VC_EM 0x10000
-#define VC_PAIRCALL 0x20000
-#define VC_QCNT 0x40000
-#define VC_INDEL_ONLY 0x80000
-
-typedef struct {
- int flag, prior_type, n1, n_sub, *sublist, n_perm;
- uint32_t *trio_aux;
- char *prior_file, **subsam, *fn_dict;
- uint8_t *ploidy;
- double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt, min_ma_lrt;
- void *bed;
-} viewconf_t;
-
-void *bed_read(const char *fn);
-void bed_destroy(void *_h);
-int bed_overlap(const void *_h, const char *chr, int beg, int end);
-
-static double ttest(int n1, int n2, int a[4])
-{
- extern double kf_betai(double a, double b, double x);
- double t, v, u1, u2;
- if (n1 == 0 || n2 == 0 || n1 + n2 < 3) return 1.0;
- u1 = (double)a[0] / n1; u2 = (double)a[2] / n2;
- if (u1 <= u2) return 1.;
- t = (u1 - u2) / sqrt(((a[1] - n1 * u1 * u1) + (a[3] - n2 * u2 * u2)) / (n1 + n2 - 2) * (1./n1 + 1./n2));
- v = n1 + n2 - 2;
-// printf("%d,%d,%d,%d,%lf,%lf,%lf\n", a[0], a[1], a[2], a[3], t, u1, u2);
- return t < 0.? 1. : .5 * kf_betai(.5*v, .5, v/(v+t*t));
-}
-
-static int test16_core(int anno[16], anno16_t *a)
-{
- extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
- double left, right;
- int i;
- a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.;
- memcpy(a->d, anno, 4 * sizeof(int));
- a->depth = anno[0] + anno[1] + anno[2] + anno[3];
- a->is_tested = (anno[0] + anno[1] > 0 && anno[2] + anno[3] > 0);
- if (a->depth == 0) return -1;
- a->mq = (int)(sqrt((anno[9] + anno[11]) / a->depth) + .499);
- kt_fisher_exact(anno[0], anno[1], anno[2], anno[3], &left, &right, &a->p[0]);
- for (i = 1; i < 4; ++i)
- a->p[i] = ttest(anno[0] + anno[1], anno[2] + anno[3], anno+4*i);
- return 0;
-}
-
-int test16(bcf1_t *b, anno16_t *a)
-{
- char *p;
- int i, anno[16];
- a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.;
- a->d[0] = a->d[1] = a->d[2] = a->d[3] = 0.;
- a->mq = a->depth = a->is_tested = 0;
- if ((p = strstr(b->info, "I16=")) == 0) return -1;
- p += 4;
- for (i = 0; i < 16; ++i) {
- errno = 0; anno[i] = strtol(p, &p, 10);
- if (anno[i] == 0 && (errno == EINVAL || errno == ERANGE)) return -2;
- ++p;
- }
- return test16_core(anno, a);
-}
-
-static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag, double em[10], int cons_llr, int64_t cons_gt)
-{
- kstring_t s;
- int has_I16, is_var;
- double fq, r;
- anno16_t a;
-
- has_I16 = test16(b, &a) >= 0? 1 : 0;
- //rm_info(b, "I16="); // FIXME: probably this function has a bug. If I move it below, I16 will not be removed!
-
- memset(&s, 0, sizeof(kstring_t));
- kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s);
- kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s);
- kputs(b->info, &s);
- if (b->info[0]) kputc(';', &s);
- { // print EM
- if (em[0] >= 0) ksprintf(&s, "AF1=%.4g", 1 - em[0]);
- if (em[4] >= 0 && em[4] <= 0.05) ksprintf(&s, ";G3=%.4g,%.4g,%.4g;HWE=%.3g", em[3], em[2], em[1], em[4]);
- if (em[5] >= 0 && em[6] >= 0) ksprintf(&s, ";AF2=%.4g,%.4g", 1 - em[5], 1 - em[6]);
- if (em[7] >= 0) ksprintf(&s, ";LRT=%.3g", em[7]);
- if (em[8] >= 0) ksprintf(&s, ";LRT2=%.3g", em[8]);
- }
- if (cons_llr > 0) {
- ksprintf(&s, ";CLR=%d", cons_llr);
- if (cons_gt > 0)
- ksprintf(&s, ";UGT=%c%c%c;CGT=%c%c%c", cons_gt&0xff, cons_gt>>8&0xff, cons_gt>>16&0xff,
- cons_gt>>32&0xff, cons_gt>>40&0xff, cons_gt>>48&0xff);
- }
- if (pr == 0) { // if pr is unset, return
- kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s);
- free(b->str);
- b->m_str = s.m; b->l_str = s.l; b->str = s.s;
- bcf_sync(b);
- return 1;
- }
-
- is_var = (pr->p_ref < pref);
- r = is_var? pr->p_ref : pr->p_var;
-
-// ksprintf(&s, ";CI95=%.4g,%.4g", pr->cil, pr->cih); // FIXME: when EM is not used, ";" should be omitted!
- ksprintf(&s, ";AC1=%d", pr->ac);
- if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
- fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded);
- if (fq < -999) fq = -999;
- if (fq > 999) fq = 999;
- ksprintf(&s, ";FQ=%.3g", fq);
- if (pr->cmp[0] >= 0.) { // two sample groups
- int i, q[3];
- for (i = 1; i < 3; ++i) {
- double x = pr->cmp[i] + pr->cmp[0]/2.;
- q[i] = x == 0? 255 : (int)(-4.343 * log(x) + .499);
- if (q[i] > 255) q[i] = 255;
- }
- if (pr->perm_rank >= 0) ksprintf(&s, ";PR=%d", pr->perm_rank);
- // ksprintf(&s, ";LRT3=%.3g", pr->lrt);
- ksprintf(&s, ";PCHI2=%.3g;PC2=%d,%d", q[1], q[2], pr->p_chi2);
- }
- if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
- kputc('\0', &s);
- rm_info(&s, "QS=");
- rm_info(&s, "I16=");
- kputs(b->fmt, &s); kputc('\0', &s);
- free(b->str);
- b->m_str = s.m; b->l_str = s.l; b->str = s.s;
- b->qual = r < 1e-100? 999 : -4.343 * log(r);
- if (b->qual > 999) b->qual = 999;
- bcf_sync(b);
- if (!is_var) bcf_shrink_alt(b, 1);
- else if (!(flag&VC_KEEPALT))
- bcf_shrink_alt(b, pr->rank0 < 2? 2 : pr->rank0+1);
- if (is_var && (flag&VC_CALL_GT)) { // call individual genotype
- int i, x, old_n_gi = b->n_gi;
- s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str;
- kputs(":GT:GQ", &s); kputc('\0', &s);
- b->m_str = s.m; b->l_str = s.l; b->str = s.s;
- bcf_sync(b);
- for (i = 0; i < b->n_smpl; ++i) {
- x = bcf_p1_call_gt(pa, pr->f_exp, i);
- ((uint8_t*)b->gi[old_n_gi].data)[i] = (x&3) == 0? 1<<3|1 : (x&3) == 1? 1 : 0;
- ((uint8_t*)b->gi[old_n_gi+1].data)[i] = x>>2;
- }
- }
- return is_var;
-}
-
-static char **read_samples(const char *fn, int *_n)
-{
- gzFile fp;
- kstream_t *ks;
- kstring_t s;
- int dret, n = 0, max = 0;
- char **sam = 0;
- *_n = 0;
- s.l = s.m = 0; s.s = 0;
- fp = gzopen(fn, "r");
- if (fp == 0)
- {
- // interpret as sample names, not as a file name
- const char *t = fn, *p = t;
- while (*t)
- {
- t++;
- if ( *t==',' || !*t )
- {
- sam = realloc(sam, sizeof(void*)*(n+1));
- sam[n] = (char*) malloc(sizeof(char)*(t-p+2));
- memcpy(sam[n], p, t-p);
- sam[n][t-p] = 0;
- sam[n][t-p+1] = 2; // assume diploid
- p = t+1;
- n++;
- }
- }
- *_n = n;
- return sam; // fail to open file
- }
- ks = ks_init(fp);
- while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
- int l;
- if (max == n) {
- max = max? max<<1 : 4;
- sam = realloc(sam, sizeof(void*)*max);
- }
- l = s.l;
- sam[n] = malloc(s.l + 2);
- strcpy(sam[n], s.s);
- sam[n][l+1] = 2; // by default, diploid
- if (dret != '\n') {
- if (ks_getuntil(ks, 0, &s, &dret) >= 0) { // read ploidy, 1 or 2
- int x = (int)s.s[0] - '0';
- if (x == 1 || x == 2) sam[n][l+1] = x;
- else fprintf(stderr, "(%s) ploidy can only be 1 or 2; assume diploid\n", __func__);
- }
- if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
- }
- ++n;
- }
- ks_destroy(ks);
- gzclose(fp);
- free(s.s);
- *_n = n;
- return sam;
-}
-
-static void write_header(bcf_hdr_t *h)
-{
- kstring_t str;
- str.l = h->l_txt? h->l_txt - 1 : 0;
- str.m = str.l + 1; str.s = h->txt;
- if (!strstr(str.s, "##INFO=<ID=DP,"))
- kputs("##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=DP4,"))
- kputs("##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=MQ,"))
- kputs("##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Root-mean-square mapping quality of covering reads\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=FQ,"))
- kputs("##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability of all samples being the same\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=AF1,"))
- kputs("##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele frequency (assuming HWE)\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=AC1,"))
- kputs("##INFO=<ID=AC1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele count (no HWE assumption)\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=AN,"))
- kputs("##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=IS,"))
- kputs("##INFO=<ID=IS,Number=2,Type=Float,Description=\"Maximum number of reads supporting an indel and fraction of indel reads\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=AC,"))
- kputs("##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes for each ALT allele, in the same order as listed\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=G3,"))
- kputs("##INFO=<ID=G3,Number=3,Type=Float,Description=\"ML estimate of genotype frequencies\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=HWE,"))
- kputs("##INFO=<ID=HWE,Number=1,Type=Float,Description=\"Chi^2 based HWE test P-value based on G3\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=CLR,"))
- kputs("##INFO=<ID=CLR,Number=1,Type=Integer,Description=\"Log ratio of genotype likelihoods with and without the constraint\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=UGT,"))
- kputs("##INFO=<ID=UGT,Number=1,Type=String,Description=\"The most probable unconstrained genotype configuration in the trio\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=CGT,"))
- kputs("##INFO=<ID=CGT,Number=1,Type=String,Description=\"The most probable constrained genotype configuration in the trio\">\n", &str);
-// if (!strstr(str.s, "##INFO=<ID=CI95,"))
-// kputs("##INFO=<ID=CI95,Number=2,Type=Float,Description=\"Equal-tail Bayesian credible interval of the site allele frequency at the 95% level\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=PV4,"))
- kputs("##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=INDEL,"))
- kputs("##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=PC2,"))
- kputs("##INFO=<ID=PC2,Number=2,Type=Integer,Description=\"Phred probability of the nonRef allele frequency in group1 samples being larger (,smaller) than in group2.\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=PCHI2,"))
- kputs("##INFO=<ID=PCHI2,Number=1,Type=Float,Description=\"Posterior weighted chi^2 P-value for testing the association between group1 and group2 samples.\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=QCHI2,"))
- kputs("##INFO=<ID=QCHI2,Number=1,Type=Integer,Description=\"Phred scaled PCHI2.\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=RP,"))
- kputs("##INFO=<ID=PR,Number=1,Type=Integer,Description=\"# permutations yielding a smaller PCHI2.\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=QBD,"))
- kputs("##INFO=<ID=QBD,Number=1,Type=Float,Description=\"Quality by Depth: QUAL/#reads\">\n", &str);
- //if (!strstr(str.s, "##INFO=<ID=RPS,"))
- // kputs("##INFO=<ID=RPS,Number=3,Type=Float,Description=\"Read Position Stats: depth, average, stddev\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=RPB,"))
- kputs("##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Read Position Bias\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=MDV,"))
- kputs("##INFO=<ID=MDV,Number=1,Type=Integer,Description=\"Maximum number of high-quality nonRef reads in samples\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=VDB,"))
- kputs("##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias (v2) for filtering splice-site artefacts in RNA-seq data. Note: this version may be broken.\">\n", &str);
- if (!strstr(str.s, "##FORMAT=<ID=GT,"))
- kputs("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n", &str);
- if (!strstr(str.s, "##FORMAT=<ID=GQ,"))
- kputs("##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">\n", &str);
- if (!strstr(str.s, "##FORMAT=<ID=GL,"))
- kputs("##FORMAT=<ID=GL,Number=3,Type=Float,Description=\"Likelihoods for RR,RA,AA genotypes (R=ref,A=alt)\">\n", &str);
- if (!strstr(str.s, "##FORMAT=<ID=DP,"))
- kputs("##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"# high-quality bases\">\n", &str);
- if (!strstr(str.s, "##FORMAT=<ID=DV,"))
- kputs("##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"# high-quality non-reference bases\">\n", &str);
- if (!strstr(str.s, "##FORMAT=<ID=SP,"))
- kputs("##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">\n", &str);
- if (!strstr(str.s, "##FORMAT=<ID=PL,"))
- kputs("##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">\n", &str);
- h->l_txt = str.l + 1; h->txt = str.s;
-}
-
-double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]);
-
-int bcfview(int argc, char *argv[])
-{
- extern int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b);
- extern void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x);
- extern int bcf_fix_gt(bcf1_t *b);
- extern int bcf_anno_max(bcf1_t *b);
- extern int bcf_shuffle(bcf1_t *b, int seed);
- extern uint32_t *bcf_trio_prep(int is_x, int is_son);
- extern int bcf_trio_call(uint32_t *prep, const bcf1_t *b, int *llr, int64_t *gt);
- extern int bcf_pair_call(const bcf1_t *b);
- extern int bcf_min_diff(const bcf1_t *b);
- extern int bcf_p1_get_M(bcf_p1aux_t *b);
-
- extern gzFile bcf_p1_fp_lk;
-
- bcf_t *bp, *bout = 0;
- bcf1_t *b, *blast;
- int c, *seeds = 0;
- uint64_t n_processed = 0, qcnt[256];
- viewconf_t vc;
- bcf_p1aux_t *p1 = 0;
- bcf_hdr_t *hin, *hout;
- int tid, begin, end;
- char moder[4], modew[4];
-
- tid = begin = end = -1;
- memset(&vc, 0, sizeof(viewconf_t));
- vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0; vc.min_lrt = 1; vc.min_ma_lrt = -1;
- memset(qcnt, 0, 8 * 256);
- while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Ywm:K:")) >= 0) {
- switch (c) {
- case '1': vc.n1 = atoi(optarg); break;
- case 'l': vc.bed = bed_read(optarg); if (!vc.bed) { fprintf(stderr,"Could not read \"%s\"\n", optarg); return 1; } break;
- case 'D': vc.fn_dict = strdup(optarg); break;
- case 'F': vc.flag |= VC_FIX_PL; break;
- case 'N': vc.flag |= VC_ACGT_ONLY; break;
- case 'G': vc.flag |= VC_NO_GENO; break;
- case 'A': vc.flag |= VC_KEEPALT; break;
- case 'b': vc.flag |= VC_BCFOUT; break;
- case 'S': vc.flag |= VC_VCFIN; break;
- case 'c': vc.flag |= VC_CALL; break;
- case 'e': vc.flag |= VC_EM; break;
- case 'v': vc.flag |= VC_VARONLY | VC_CALL; break;
- case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break;
- case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break;
- case 'I': vc.flag |= VC_NO_INDEL; break;
- case 'w': vc.flag |= VC_INDEL_ONLY; break;
- case 'M': vc.flag |= VC_ANNO_MAX; break;
- case 'Y': vc.flag |= VC_QCNT; break;
- case 'm': vc.min_ma_lrt = atof(optarg); break;
- case 't': vc.theta = atof(optarg); break;
- case 'p': vc.pref = atof(optarg); break;
- case 'i': vc.indel_frac = atof(optarg); break;
- case 'Q': vc.flag |= VC_QCALL; break;
- case 'L': vc.flag |= VC_ADJLD; break;
- case 'U': vc.n_perm = atoi(optarg); break;
- case 'C': vc.min_lrt = atof(optarg); break;
- case 'X': vc.min_perm_p = atof(optarg); break;
- case 'd': vc.min_smpl_frac = atof(optarg); break;
- case 'K': bcf_p1_fp_lk = gzopen(optarg, "w"); break;
- case 's': vc.subsam = read_samples(optarg, &vc.n_sub);
- vc.ploidy = calloc(vc.n_sub + 1, 1);
- for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = vc.subsam[tid][strlen(vc.subsam[tid]) + 1];
- tid = -1;
- break;
- case 'T':
- if (strcmp(optarg, "trioauto") == 0) vc.trio_aux = bcf_trio_prep(0, 0);
- else if (strcmp(optarg, "trioxd") == 0) vc.trio_aux = bcf_trio_prep(1, 0);
- else if (strcmp(optarg, "trioxs") == 0) vc.trio_aux = bcf_trio_prep(1, 1);
- else if (strcmp(optarg, "pair") == 0) vc.flag |= VC_PAIRCALL;
- else {
- fprintf(stderr, "[%s] Option '-T' can only take value trioauto, trioxd or trioxs.\n", __func__);
- return 1;
- }
- break;
- case 'P':
- if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL;
- else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2;
- else if (strcmp(optarg, "flat") == 0) vc.prior_type = MC_PTYPE_FLAT;
- else vc.prior_file = strdup(optarg);
- break;
- }
- }
- if (argc == optind) {
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: bcftools view [options] <in.bcf> [reg]\n\n");
- fprintf(stderr, "Input/output options:\n\n");
- fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n");
- fprintf(stderr, " -b output BCF instead of VCF\n");
- fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n");
- fprintf(stderr, " -F PL generated by r921 or before (which generate old ordering)\n");
- fprintf(stderr, " -G suppress all individual genotype information\n");
- fprintf(stderr, " -l FILE list of sites (chr pos) or regions (BED) to output [all sites]\n");
- fprintf(stderr, " -L calculate LD for adjacent sites\n");
- fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n");
- fprintf(stderr, " -Q output the QCALL likelihood format\n");
- fprintf(stderr, " -s FILE list of samples to use [all samples]\n");
- fprintf(stderr, " -S input is VCF\n");
- fprintf(stderr, " -u uncompressed BCF output (force -b)\n");
- fprintf(stderr, "\nConsensus/variant calling options:\n\n");
- fprintf(stderr, " -c SNP calling (force -e)\n");
- fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n");
- fprintf(stderr, " -e likelihood based analyses\n");
- fprintf(stderr, " -g call genotypes at variant sites (force -c)\n");
- fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac);
- fprintf(stderr, " -I skip indels\n");
- fprintf(stderr, " -m FLOAT alternative model for multiallelic and rare-variant calling, include if P(chi^2)>=FLOAT\n");
- fprintf(stderr, " -p FLOAT variant if P(ref|D)<FLOAT [%.3g]\n", vc.pref);
- fprintf(stderr, " -P STR type of prior: full, cond2, flat [full]\n");
- fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4g]\n", vc.theta);
- fprintf(stderr, " -T STR constrained calling; STR can be: pair, trioauto, trioxd and trioxs (see manual) [null]\n");
- fprintf(stderr, " -v output potential variant sites only (force -c)\n");
- fprintf(stderr, "\nContrast calling and association test options:\n\n");
- fprintf(stderr, " -1 INT number of group-1 samples [0]\n");
- fprintf(stderr, " -C FLOAT posterior constrast for LRT<FLOAT and P(ref|D)<0.5 [%g]\n", vc.min_lrt);
- fprintf(stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n");
- fprintf(stderr, " -X FLOAT only perform permutations for P(chi^2)<FLOAT [%g]\n", vc.min_perm_p);
- fprintf(stderr, "\n");
- return 1;
- }
-
- if (vc.flag & VC_CALL) vc.flag |= VC_EM;
- if ((vc.flag & VC_VCFIN) && (vc.flag & VC_BCFOUT) && vc.fn_dict == 0) {
- fprintf(stderr, "[%s] For VCF->BCF conversion please specify the sequence dictionary with -D\n", __func__);
- return 1;
- }
- if (vc.n1 <= 0) vc.n_perm = 0; // TODO: give a warning here!
- if (vc.n_perm > 0) {
- seeds = malloc(vc.n_perm * sizeof(int));
- srand48(time(0));
- for (c = 0; c < vc.n_perm; ++c) seeds[c] = lrand48();
- }
- b = calloc(1, sizeof(bcf1_t));
- blast = calloc(1, sizeof(bcf1_t));
- strcpy(moder, "r");
- if (!(vc.flag & VC_VCFIN)) strcat(moder, "b");
- strcpy(modew, "w");
- if (vc.flag & VC_BCFOUT) strcat(modew, "b");
- if (vc.flag & VC_UNCOMP) strcat(modew, "u");
- bp = vcf_open(argv[optind], moder);
- hin = hout = vcf_hdr_read(bp);
- if (vc.fn_dict && (vc.flag & VC_VCFIN))
- vcf_dictread(bp, hin, vc.fn_dict);
- bout = vcf_open("-", modew);
- if (!(vc.flag & VC_QCALL)) {
- if (vc.n_sub) {
- vc.sublist = calloc(vc.n_sub, sizeof(int));
- hout = bcf_hdr_subsam(hin, vc.n_sub, vc.subsam, vc.sublist);
- }
- write_header(hout); // always print the header
- vcf_hdr_write(bout, hout);
- }
- if (vc.flag & VC_CALL) {
- p1 = bcf_p1_init(hout->n_smpl, vc.ploidy);
- if (vc.prior_file) {
- if (bcf_p1_read_prior(p1, vc.prior_file) < 0) {
- fprintf(stderr, "[%s] fail to read the prior AFS.\n", __func__);
- return 1;
- }
- } else bcf_p1_init_prior(p1, vc.prior_type, vc.theta);
- if (vc.n1 > 0 && vc.min_lrt > 0.) { // set n1
- bcf_p1_set_n1(p1, vc.n1);
- bcf_p1_init_subprior(p1, vc.prior_type, vc.theta);
- }
- if (vc.indel_frac > 0.) bcf_p1_indel_prior(p1, vc.indel_frac); // otherwise use the default indel_frac
- }
- if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) {
- void *str2id = bcf_build_refhash(hout);
- if (bcf_parse_region(str2id, argv[optind+1], &tid, &begin, &end) >= 0) {
- bcf_idx_t *idx;
- idx = bcf_idx_load(argv[optind]);
- if (idx) {
- uint64_t off;
- off = bcf_idx_query(idx, tid, begin);
- if (off == 0) {
- fprintf(stderr, "[%s] no records in the query region.\n", __func__);
- return 1; // FIXME: a lot of memory leaks...
- }
- bgzf_seek(bp->fp, off, SEEK_SET);
- bcf_idx_destroy(idx);
- }
- }
- }
- if (bcf_p1_fp_lk && p1) {
- int32_t M = bcf_p1_get_M(p1);
- gzwrite(bcf_p1_fp_lk, &M, 4);
- }
- while (vcf_read(bp, hin, b) > 0) {
- int is_indel, cons_llr = -1;
- int64_t cons_gt = -1;
- double em[10];
- if ((vc.flag & VC_VARONLY) && strcmp(b->alt, "X") == 0) continue;
- if ((vc.flag & VC_VARONLY) && vc.min_smpl_frac > 0.) {
- extern int bcf_smpl_covered(const bcf1_t *b);
- int n = bcf_smpl_covered(b);
- if ((double)n / b->n_smpl < vc.min_smpl_frac) continue;
- }
- if (vc.n_sub) bcf_subsam(vc.n_sub, vc.sublist, b);
- if (vc.flag & VC_FIX_PL) bcf_fix_pl(b);
- is_indel = bcf_is_indel(b);
- if ((vc.flag & VC_NO_INDEL) && is_indel) continue;
- if ((vc.flag & VC_INDEL_ONLY) && !is_indel) continue;
- if ((vc.flag & VC_ACGT_ONLY) && !is_indel) {
- int x;
- if (b->ref[0] == 0 || b->ref[1] != 0) continue;
- x = toupper(b->ref[0]);
- if (x != 'A' && x != 'C' && x != 'G' && x != 'T') continue;
- }
- if (vc.bed && !bed_overlap(vc.bed, hin->ns[b->tid], b->pos, b->pos + strlen(b->ref))) continue;
- if (tid >= 0) {
- int l = strlen(b->ref);
- l = b->pos + (l > 0? l : 1);
- if (b->tid != tid || b->pos >= end) break;
- if (!(l > begin && end > b->pos)) continue;
- }
- ++n_processed;
- if ((vc.flag & VC_QCNT) && !is_indel) { // summarize the difference
- int x = bcf_min_diff(b);
- if (x > 255) x = 255;
- if (x >= 0) ++qcnt[x];
- }
- if (vc.flag & VC_QCALL) { // output QCALL format; STOP here
- bcf_2qcall(hout, b);
- continue;
- }
- if (vc.trio_aux) // do trio calling
- bcf_trio_call(vc.trio_aux, b, &cons_llr, &cons_gt);
- else if (vc.flag & VC_PAIRCALL)
- cons_llr = bcf_pair_call(b);
- if (vc.flag & (VC_CALL|VC_ADJLD|VC_EM)) bcf_gl2pl(b);
- if (vc.flag & VC_EM) bcf_em1(b, vc.n1, 0x1ff, em);
- else {
- int i;
- for (i = 0; i < 9; ++i) em[i] = -1.;
- }
- if ( !(vc.flag&VC_KEEPALT) && (vc.flag&VC_CALL) && vc.min_ma_lrt>=0 )
- {
- bcf_p1_set_ploidy(b, p1); // could be improved: do this per site to allow pseudo-autosomal regions
- int gts = call_multiallelic_gt(b, p1, vc.min_ma_lrt, vc.flag&VC_VARONLY);
- if ( gts<=1 && vc.flag & VC_VARONLY ) continue;
- }
- else if (vc.flag & VC_CALL) { // call variants
- bcf_p1rst_t pr;
- int calret;
- gzwrite(bcf_p1_fp_lk, &b->tid, 4);
- gzwrite(bcf_p1_fp_lk, &b->pos, 4);
- gzwrite(bcf_p1_fp_lk, &em[0], sizeof(double));
- calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr);
- if (n_processed % 100000 == 0) {
- fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed);
- bcf_p1_dump_afs(p1);
- }
- if (pr.p_ref >= vc.pref && (vc.flag & VC_VARONLY)) continue;
- if (vc.n_perm && vc.n1 > 0 && pr.p_chi2 < vc.min_perm_p) { // permutation test
- bcf_p1rst_t r;
- int i, n = 0;
- for (i = 0; i < vc.n_perm; ++i) {
-#ifdef BCF_PERM_LRT // LRT based permutation is much faster but less robust to artifacts
- double x[10];
- bcf_shuffle(b, seeds[i]);
- bcf_em1(b, vc.n1, 1<<7, x);
- if (x[7] < em[7]) ++n;
-#else
- bcf_shuffle(b, seeds[i]);
- bcf_p1_cal(b, 1, p1, &r);
- if (pr.p_chi2 >= r.p_chi2) ++n;
-#endif
- }
- pr.perm_rank = n;
- }
- if (calret >= 0) update_bcf1(b, p1, &pr, vc.pref, vc.flag, em, cons_llr, cons_gt);
- } else if (vc.flag & VC_EM) update_bcf1(b, 0, 0, 0, vc.flag, em, cons_llr, cons_gt);
- if (vc.flag & VC_ADJLD) { // compute LD
- double f[4], r2;
- if ((r2 = bcf_pair_freq(blast, b, f)) >= 0) {
- kstring_t s;
- s.m = s.l = 0; s.s = 0;
- if (*b->info) kputc(';', &s);
- ksprintf(&s, "NEIR=%.3f;NEIF4=%.3f,%.3f,%.3f,%.3f", r2, f[0], f[1], f[2], f[3]);
- bcf_append_info(b, s.s, s.l);
- free(s.s);
- }
- bcf_cpy(blast, b);
- }
- if (vc.flag & VC_ANNO_MAX) bcf_anno_max(b);
- if (vc.flag & VC_NO_GENO) { // do not output GENO fields
- b->n_gi = 0;
- b->fmt[0] = '\0';
- b->l_str = b->fmt - b->str + 1;
- } else bcf_fix_gt(b);
- vcf_write(bout, hout, b);
- }
-
- if (bcf_p1_fp_lk) gzclose(bcf_p1_fp_lk);
- if (vc.prior_file) free(vc.prior_file);
- if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1);
- if (hin != hout) bcf_hdr_destroy(hout);
- bcf_hdr_destroy(hin);
- bcf_destroy(b); bcf_destroy(blast);
- vcf_close(bp); vcf_close(bout);
- if (vc.fn_dict) free(vc.fn_dict);
- if (vc.ploidy) free(vc.ploidy);
- if (vc.trio_aux) free(vc.trio_aux);
- if (vc.n_sub) {
- int i;
- for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]);
- free(vc.subsam); free(vc.sublist);
- }
- if (vc.bed) bed_destroy(vc.bed);
- if (vc.flag & VC_QCNT)
- for (c = 0; c < 256; ++c)
- fprintf(stderr, "QT\t%d\t%lld\n", c, (long long)qcnt[c]);
- if (seeds) free(seeds);
- if (p1) bcf_p1_destroy(p1);
- return 0;
-}
diff --git a/sam/bcftools/em.c b/sam/bcftools/em.c
deleted file mode 100644
index b7dfe1a..0000000
--- a/sam/bcftools/em.c
+++ /dev/null
@@ -1,310 +0,0 @@
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include "bcf.h"
-#include "kmin.h"
-
-static double g_q2p[256];
-
-#define ITER_MAX 50
-#define ITER_TRY 10
-#define EPS 1e-5
-
-extern double kf_gammaq(double, double);
-
-/*
- Generic routines
- */
-// get the 3 genotype likelihoods
-static double *get_pdg3(const bcf1_t *b)
-{
- double *pdg;
- const uint8_t *PL = 0;
- int i, PL_len = 0;
- // initialize g_q2p if necessary
- if (g_q2p[0] == 0.)
- for (i = 0; i < 256; ++i)
- g_q2p[i] = pow(10., -i / 10.);
- // set PL and PL_len
- for (i = 0; i < b->n_gi; ++i) {
- if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
- PL = (const uint8_t*)b->gi[i].data;
- PL_len = b->gi[i].len;
- break;
- }
- }
- if (i == b->n_gi) return 0; // no PL
- // fill pdg
- pdg = malloc(3 * b->n_smpl * sizeof(double));
- for (i = 0; i < b->n_smpl; ++i) {
- const uint8_t *pi = PL + i * PL_len;
- double *p = pdg + i * 3;
- p[0] = g_q2p[pi[2]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]];
- }
- return pdg;
-}
-
-// estimate site allele frequency in a very naive and inaccurate way
-static double est_freq(int n, const double *pdg)
-{
- int i, gcnt[3], tmp1;
- // get a rough estimate of the genotype frequency
- gcnt[0] = gcnt[1] = gcnt[2] = 0;
- for (i = 0; i < n; ++i) {
- const double *p = pdg + i * 3;
- if (p[0] != 1. || p[1] != 1. || p[2] != 1.) {
- int which = p[0] > p[1]? 0 : 1;
- which = p[which] > p[2]? which : 2;
- ++gcnt[which];
- }
- }
- tmp1 = gcnt[0] + gcnt[1] + gcnt[2];
- return (tmp1 == 0)? -1.0 : (.5 * gcnt[1] + gcnt[2]) / tmp1;
-}
-
-/*
- Single-locus EM
- */
-
-typedef struct {
- int beg, end;
- const double *pdg;
-} minaux1_t;
-
-static double prob1(double f, void *data)
-{
- minaux1_t *a = (minaux1_t*)data;
- double p = 1., l = 0., f3[3];
- int i;
-// printf("brent %lg\n", f);
- if (f < 0 || f > 1) return 1e300;
- f3[0] = (1.-f)*(1.-f); f3[1] = 2.*f*(1.-f); f3[2] = f*f;
- for (i = a->beg; i < a->end; ++i) {
- const double *pdg = a->pdg + i * 3;
- p *= pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2];
- if (p < 1e-200) l -= log(p), p = 1.;
- }
- return l - log(p);
-}
-
-// one EM iteration for allele frequency estimate
-static double freq_iter(double *f, const double *_pdg, int beg, int end)
-{
- double f0 = *f, f3[3], err;
- int i;
-// printf("em %lg\n", *f);
- f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
- for (i = beg, f0 = 0.; i < end; ++i) {
- const double *pdg = _pdg + i * 3;
- f0 += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2])
- / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]);
- }
- f0 /= (end - beg) * 2;
- err = fabs(f0 - *f);
- *f = f0;
- return err;
-}
-
-/* The following function combines EM and Brent's method. When the signal from
- * the data is strong, EM is faster but sometimes, EM may converge very slowly.
- * When this happens, we switch to Brent's method. The idea is learned from
- * Rasmus Nielsen.
- */
-static double freqml(double f0, int beg, int end, const double *pdg)
-{
- int i;
- double f;
- for (i = 0, f = f0; i < ITER_TRY; ++i)
- if (freq_iter(&f, pdg, beg, end) < EPS) break;
- if (i == ITER_TRY) { // haven't converged yet; try Brent's method
- minaux1_t a;
- a.beg = beg; a.end = end; a.pdg = pdg;
- kmin_brent(prob1, f0 == f? .5*f0 : f0, f, (void*)&a, EPS, &f);
- }
- return f;
-}
-
-// one EM iteration for genotype frequency estimate
-static double g3_iter(double g[3], const double *_pdg, int beg, int end)
-{
- double err, gg[3];
- int i;
- gg[0] = gg[1] = gg[2] = 0.;
-// printf("%lg,%lg,%lg\n", g[0], g[1], g[2]);
- for (i = beg; i < end; ++i) {
- double sum, tmp[3];
- const double *pdg = _pdg + i * 3;
- tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2];
- sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg);
- gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum;
- }
- err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]);
- err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]);
- g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2];
- return err;
-}
-
-// perform likelihood ratio test
-static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3])
-{
- double r;
- int i;
- for (i = 0, r = 1.; i < n1; ++i) {
- const double *p = pdg + i * 3;
- r *= (p[0] * f3[1][0] + p[1] * f3[1][1] + p[2] * f3[1][2])
- / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]);
- }
- for (; i < n; ++i) {
- const double *p = pdg + i * 3;
- r *= (p[0] * f3[2][0] + p[1] * f3[2][1] + p[2] * f3[2][2])
- / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]);
- }
- return r;
-}
-
-// x[0]: ref frequency
-// x[1..3]: alt-alt, alt-ref, ref-ref frequenc
-// x[4]: HWE P-value
-// x[5..6]: group1 freq, group2 freq
-// x[7]: 1-degree P-value
-// x[8]: 2-degree P-value
-int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10])
-{
- double *pdg;
- int i, n, n2;
- if (b->n_alleles < 2) return -1; // one allele only
- // initialization
- if (n1 < 0 || n1 > b->n_smpl) n1 = 0;
- if (flag & 1<<7) flag |= 7<<5; // compute group freq if LRT is required
- if (flag & 0xf<<1) flag |= 0xf<<1;
- n = b->n_smpl; n2 = n - n1;
- pdg = get_pdg3(b);
- if (pdg == 0) return -1;
- for (i = 0; i < 10; ++i) x[i] = -1.; // set to negative
- {
- if ((x[0] = est_freq(n, pdg)) < 0.) {
- free(pdg);
- return -1; // no data
- }
- x[0] = freqml(x[0], 0, n, pdg);
- }
- if (flag & (0xf<<1|3<<8)) { // estimate the genotype frequency and test HWE
- double *g = x + 1, f3[3], r;
- f3[0] = g[0] = (1 - x[0]) * (1 - x[0]);
- f3[1] = g[1] = 2 * x[0] * (1 - x[0]);
- f3[2] = g[2] = x[0] * x[0];
- for (i = 0; i < ITER_MAX; ++i)
- if (g3_iter(g, pdg, 0, n) < EPS) break;
- // Hardy-Weinberg equilibrium (HWE)
- for (i = 0, r = 1.; i < n; ++i) {
- double *p = pdg + i * 3;
- r *= (p[0] * g[0] + p[1] * g[1] + p[2] * g[2]) / (p[0] * f3[0] + p[1] * f3[1] + p[2] * f3[2]);
- }
- x[4] = kf_gammaq(.5, log(r));
- }
- if ((flag & 7<<5) && n1 > 0 && n1 < n) { // group frequency
- x[5] = freqml(x[0], 0, n1, pdg);
- x[6] = freqml(x[0], n1, n, pdg);
- }
- if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value
- double f[3], f3[3][3], tmp;
- f[0] = x[0]; f[1] = x[5]; f[2] = x[6];
- for (i = 0; i < 3; ++i)
- f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i];
- tmp = log(lk_ratio_test(n, n1, pdg, f3));
- if (tmp < 0) tmp = 0;
- x[7] = kf_gammaq(.5, tmp);
- }
- if ((flag & 3<<8) && n1 > 0 && n1 < n) { // 2-degree P-value
- double g[3][3], tmp;
- for (i = 0; i < 3; ++i) memcpy(g[i], x + 1, 3 * sizeof(double));
- for (i = 0; i < ITER_MAX; ++i)
- if (g3_iter(g[1], pdg, 0, n1) < EPS) break;
- for (i = 0; i < ITER_MAX; ++i)
- if (g3_iter(g[2], pdg, n1, n) < EPS) break;
- tmp = log(lk_ratio_test(n, n1, pdg, g));
- if (tmp < 0) tmp = 0;
- x[8] = kf_gammaq(1., tmp);
- }
- // free
- free(pdg);
- return 0;
-}
-
-/*
- Two-locus EM (LD)
- */
-
-#define _G1(h, k) ((h>>1&1) + (k>>1&1))
-#define _G2(h, k) ((h&1) + (k&1))
-
-// 0: the previous site; 1: the current site
-static int pair_freq_iter(int n, double *pdg[2], double f[4])
-{
- double ff[4];
- int i, k, h;
-// printf("%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]);
- memset(ff, 0, 4 * sizeof(double));
- for (i = 0; i < n; ++i) {
- double *p[2], sum, tmp;
- p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3;
- for (k = 0, sum = 0.; k < 4; ++k)
- for (h = 0; h < 4; ++h)
- sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)];
- for (k = 0; k < 4; ++k) {
- tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)])
- + f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)])
- + f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)])
- + f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]);
- ff[k] += f[k] * tmp / sum;
- }
- }
- for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n);
- return 0;
-}
-
-double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4])
-{
- const bcf1_t *b[2];
- int i, j, n_smpl;
- double *pdg[2], flast[4], r, f0[2];
- // initialize others
- if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples
- n_smpl = b0->n_smpl;
- b[0] = b0; b[1] = b1;
- f[0] = f[1] = f[2] = f[3] = -1.;
- if (b[0]->n_alleles < 2 || b[1]->n_alleles < 2) return -1; // one allele only
- pdg[0] = get_pdg3(b0); pdg[1] = get_pdg3(b1);
- if (pdg[0] == 0 || pdg[1] == 0) {
- free(pdg[0]); free(pdg[1]);
- return -1;
- }
- // set the initial value
- f0[0] = est_freq(n_smpl, pdg[0]);
- f0[1] = est_freq(n_smpl, pdg[1]);
- f[0] = (1 - f0[0]) * (1 - f0[1]); f[3] = f0[0] * f0[1];
- f[1] = (1 - f0[0]) * f0[1]; f[2] = f0[0] * (1 - f0[1]);
- // iteration
- for (j = 0; j < ITER_MAX; ++j) {
- double eps = 0;
- memcpy(flast, f, 4 * sizeof(double));
- pair_freq_iter(n_smpl, pdg, f);
- for (i = 0; i < 4; ++i) {
- double x = fabs(f[i] - flast[i]);
- if (x > eps) eps = x;
- }
- if (eps < EPS) break;
- }
- // free
- free(pdg[0]); free(pdg[1]);
- { // calculate r^2
- double p[2], q[2], D;
- p[0] = f[0] + f[1]; q[0] = 1 - p[0];
- p[1] = f[0] + f[2]; q[1] = 1 - p[1];
- D = f[0] * f[3] - f[1] * f[2];
- r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1]));
-// printf("R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r);
- if (isnan(r)) r = -1.;
- }
- return r;
-}
diff --git a/sam/bcftools/fet.c b/sam/bcftools/fet.c
deleted file mode 100644
index 5812517..0000000
--- a/sam/bcftools/fet.c
+++ /dev/null
@@ -1,112 +0,0 @@
-#include <math.h>
-#include <stdlib.h>
-
-/* This program is implemented with ideas from this web page:
- *
- * http://www.langsrud.com/fisher.htm
- */
-
-// log\binom{n}{k}
-static double lbinom(int n, int k)
-{
- if (k == 0 || n == k) return 0;
- return lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1);
-}
-
-// n11 n12 | n1_
-// n21 n22 | n2_
-//-----------+----
-// n_1 n_2 | n
-
-// hypergeometric distribution
-static double hypergeo(int n11, int n1_, int n_1, int n)
-{
- return exp(lbinom(n1_, n11) + lbinom(n-n1_, n_1-n11) - lbinom(n, n_1));
-}
-
-typedef struct {
- int n11, n1_, n_1, n;
- double p;
-} hgacc_t;
-
-// incremental version of hypergenometric distribution
-static double hypergeo_acc(int n11, int n1_, int n_1, int n, hgacc_t *aux)
-{
- if (n1_ || n_1 || n) {
- aux->n11 = n11; aux->n1_ = n1_; aux->n_1 = n_1; aux->n = n;
- } else { // then only n11 changed; the rest fixed
- if (n11%11 && n11 + aux->n - aux->n1_ - aux->n_1) {
- if (n11 == aux->n11 + 1) { // incremental
- aux->p *= (double)(aux->n1_ - aux->n11) / n11
- * (aux->n_1 - aux->n11) / (n11 + aux->n - aux->n1_ - aux->n_1);
- aux->n11 = n11;
- return aux->p;
- }
- if (n11 == aux->n11 - 1) { // incremental
- aux->p *= (double)aux->n11 / (aux->n1_ - n11)
- * (aux->n11 + aux->n - aux->n1_ - aux->n_1) / (aux->n_1 - n11);
- aux->n11 = n11;
- return aux->p;
- }
- }
- aux->n11 = n11;
- }
- aux->p = hypergeo(aux->n11, aux->n1_, aux->n_1, aux->n);
- return aux->p;
-}
-
-double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two)
-{
- int i, j, max, min;
- double p, q, left, right;
- hgacc_t aux;
- int n1_, n_1, n;
-
- n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 and n
- max = (n_1 < n1_) ? n_1 : n1_; // max n11, for right tail
- min = n1_ + n_1 - n;
- if (min < 0) min = 0; // min n11, for left tail
- *two = *_left = *_right = 1.;
- if (min == max) return 1.; // no need to do test
- q = hypergeo_acc(n11, n1_, n_1, n, &aux); // the probability of the current table
- // left tail
- p = hypergeo_acc(min, 0, 0, 0, &aux);
- for (left = 0., i = min + 1; p < 0.99999999 * q; ++i) // loop until underflow
- left += p, p = hypergeo_acc(i, 0, 0, 0, &aux);
- --i;
- if (p < 1.00000001 * q) left += p;
- else --i;
- // right tail
- p = hypergeo_acc(max, 0, 0, 0, &aux);
- for (right = 0., j = max - 1; p < 0.99999999 * q; --j) // loop until underflow
- right += p, p = hypergeo_acc(j, 0, 0, 0, &aux);
- ++j;
- if (p < 1.00000001 * q) right += p;
- else ++j;
- // two-tail
- *two = left + right;
- if (*two > 1.) *two = 1.;
- // adjust left and right
- if (abs(i - n11) < abs(j - n11)) right = 1. - left + q;
- else left = 1.0 - right + q;
- *_left = left; *_right = right;
- return q;
-}
-
-#ifdef FET_MAIN
-#include <stdio.h>
-
-int main(int argc, char *argv[])
-{
- char id[1024];
- int n11, n12, n21, n22;
- double left, right, twotail, prob;
-
- while (scanf("%s%d%d%d%d", id, &n11, &n12, &n21, &n22) == 5) {
- prob = kt_fisher_exact(n11, n12, n21, n22, &left, &right, &twotail);
- printf("%s\t%d\t%d\t%d\t%d\t%.6g\t%.6g\t%.6g\t%.6g\n", id, n11, n12, n21, n22,
- prob, left, right, twotail);
- }
- return 0;
-}
-#endif
diff --git a/sam/bcftools/index.c b/sam/bcftools/index.c
deleted file mode 100644
index a7db24f..0000000
--- a/sam/bcftools/index.c
+++ /dev/null
@@ -1,336 +0,0 @@
-#include <assert.h>
-#include <ctype.h>
-#include <sys/stat.h>
-#include "bam_endian.h"
-#include "kstring.h"
-#include "bcf.h"
-#ifdef _USE_KNETFILE
-#include "knetfile.h"
-#endif
-
-#define TAD_LIDX_SHIFT 13
-
-typedef struct {
- int32_t n, m;
- uint64_t *offset;
-} bcf_lidx_t;
-
-struct __bcf_idx_t {
- int32_t n;
- bcf_lidx_t *index2;
-};
-
-/************
- * indexing *
- ************/
-
-static inline void insert_offset2(bcf_lidx_t *index2, int _beg, int _end, uint64_t offset)
-{
- int i, beg, end;
- beg = _beg >> TAD_LIDX_SHIFT;
- end = (_end - 1) >> TAD_LIDX_SHIFT;
- if (index2->m < end + 1) {
- int old_m = index2->m;
- index2->m = end + 1;
- kroundup32(index2->m);
- index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
- memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
- }
- if (beg == end) {
- if (index2->offset[beg] == 0) index2->offset[beg] = offset;
- } else {
- for (i = beg; i <= end; ++i)
- if (index2->offset[i] == 0) index2->offset[i] = offset;
- }
- if (index2->n < end + 1) index2->n = end + 1;
-}
-
-bcf_idx_t *bcf_idx_core(bcf_t *bp, bcf_hdr_t *h)
-{
- bcf_idx_t *idx;
- int32_t last_coor, last_tid;
- uint64_t last_off;
- kstring_t *str;
- BGZF *fp = bp->fp;
- bcf1_t *b;
- int ret;
-
- b = calloc(1, sizeof(bcf1_t));
- str = calloc(1, sizeof(kstring_t));
- idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t));
- idx->n = h->n_ref;
- idx->index2 = calloc(h->n_ref, sizeof(bcf_lidx_t));
-
- last_tid = 0xffffffffu;
- last_off = bgzf_tell(fp); last_coor = 0xffffffffu;
- while ((ret = bcf_read(bp, h, b)) > 0) {
- int end, tmp;
- if (last_tid != b->tid) { // change of chromosomes
- last_tid = b->tid;
- } else if (last_coor > b->pos) {
- fprintf(stderr, "[bcf_idx_core] the input is out of order\n");
- free(str->s); free(str); free(idx); bcf_destroy(b);
- return 0;
- }
- tmp = strlen(b->ref);
- end = b->pos + (tmp > 0? tmp : 1);
- insert_offset2(&idx->index2[b->tid], b->pos, end, last_off);
- last_off = bgzf_tell(fp);
- last_coor = b->pos;
- }
- free(str->s); free(str); bcf_destroy(b);
- return idx;
-}
-
-void bcf_idx_destroy(bcf_idx_t *idx)
-{
- int i;
- if (idx == 0) return;
- for (i = 0; i < idx->n; ++i) free(idx->index2[i].offset);
- free(idx->index2);
- free(idx);
-}
-
-/******************
- * index file I/O *
- ******************/
-
-void bcf_idx_save(const bcf_idx_t *idx, BGZF *fp)
-{
- int32_t i, ti_is_be;
- ti_is_be = bam_is_big_endian();
- bgzf_write(fp, "BCI\4", 4);
- if (ti_is_be) {
- uint32_t x = idx->n;
- bgzf_write(fp, bam_swap_endian_4p(&x), 4);
- } else bgzf_write(fp, &idx->n, 4);
- for (i = 0; i < idx->n; ++i) {
- bcf_lidx_t *index2 = idx->index2 + i;
- // write linear index (index2)
- if (ti_is_be) {
- int x = index2->n;
- bgzf_write(fp, bam_swap_endian_4p(&x), 4);
- } else bgzf_write(fp, &index2->n, 4);
- if (ti_is_be) { // big endian
- int x;
- for (x = 0; (int)x < index2->n; ++x)
- bam_swap_endian_8p(&index2->offset[x]);
- bgzf_write(fp, index2->offset, 8 * index2->n);
- for (x = 0; (int)x < index2->n; ++x)
- bam_swap_endian_8p(&index2->offset[x]);
- } else bgzf_write(fp, index2->offset, 8 * index2->n);
- }
-}
-
-static bcf_idx_t *bcf_idx_load_core(BGZF *fp)
-{
- int i, ti_is_be;
- char magic[4];
- bcf_idx_t *idx;
- ti_is_be = bam_is_big_endian();
- if (fp == 0) {
- fprintf(stderr, "[%s] fail to load index.\n", __func__);
- return 0;
- }
- bgzf_read(fp, magic, 4);
- if (strncmp(magic, "BCI\4", 4)) {
- fprintf(stderr, "[%s] wrong magic number.\n", __func__);
- return 0;
- }
- idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t));
- bgzf_read(fp, &idx->n, 4);
- if (ti_is_be) bam_swap_endian_4p(&idx->n);
- idx->index2 = (bcf_lidx_t*)calloc(idx->n, sizeof(bcf_lidx_t));
- for (i = 0; i < idx->n; ++i) {
- bcf_lidx_t *index2 = idx->index2 + i;
- int j;
- bgzf_read(fp, &index2->n, 4);
- if (ti_is_be) bam_swap_endian_4p(&index2->n);
- index2->m = index2->n;
- index2->offset = (uint64_t*)calloc(index2->m, 8);
- bgzf_read(fp, index2->offset, index2->n * 8);
- if (ti_is_be)
- for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
- }
- return idx;
-}
-
-bcf_idx_t *bcf_idx_load_local(const char *fnidx)
-{
- BGZF *fp;
- fp = bgzf_open(fnidx, "r");
- if (fp) {
- bcf_idx_t *idx = bcf_idx_load_core(fp);
- bgzf_close(fp);
- return idx;
- } else return 0;
-}
-
-#ifdef _USE_KNETFILE
-static void download_from_remote(const char *url)
-{
- const int buf_size = 1 * 1024 * 1024;
- char *fn;
- FILE *fp;
- uint8_t *buf;
- knetFile *fp_remote;
- int l;
- if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
- l = strlen(url);
- for (fn = (char*)url + l - 1; fn >= url; --fn)
- if (*fn == '/') break;
- ++fn; // fn now points to the file name
- fp_remote = knet_open(url, "r");
- if (fp_remote == 0) {
- fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
- return;
- }
- if ((fp = fopen(fn, "w")) == 0) {
- fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
- knet_close(fp_remote);
- return;
- }
- buf = (uint8_t*)calloc(buf_size, 1);
- while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
- fwrite(buf, 1, l, fp);
- free(buf);
- fclose(fp);
- knet_close(fp_remote);
-}
-#else
-static void download_from_remote(const char *url)
-{
- return;
-}
-#endif
-
-static char *get_local_version(const char *fn)
-{
- struct stat sbuf;
- char *fnidx = (char*)calloc(strlen(fn) + 5, 1);
- strcat(strcpy(fnidx, fn), ".bci");
- if ((strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx)) {
- char *p, *url;
- int l = strlen(fnidx);
- for (p = fnidx + l - 1; p >= fnidx; --p)
- if (*p == '/') break;
- url = fnidx; fnidx = strdup(p + 1);
- if (stat(fnidx, &sbuf) == 0) {
- free(url);
- return fnidx;
- }
- fprintf(stderr, "[%s] downloading the index file...\n", __func__);
- download_from_remote(url);
- free(url);
- }
- if (stat(fnidx, &sbuf) == 0) return fnidx;
- free(fnidx); return 0;
-}
-
-bcf_idx_t *bcf_idx_load(const char *fn)
-{
- bcf_idx_t *idx;
- char *fname = get_local_version(fn);
- if (fname == 0) return 0;
- idx = bcf_idx_load_local(fname);
- free(fname);
- return idx;
-}
-
-int bcf_idx_build2(const char *fn, const char *_fnidx)
-{
- char *fnidx;
- BGZF *fpidx;
- bcf_t *bp;
- bcf_idx_t *idx;
- bcf_hdr_t *h;
- if ((bp = bcf_open(fn, "r")) == 0) {
- fprintf(stderr, "[bcf_idx_build2] fail to open the BAM file.\n");
- return -1;
- }
- h = bcf_hdr_read(bp);
- idx = bcf_idx_core(bp, h);
- bcf_close(bp);
- if (_fnidx == 0) {
- fnidx = (char*)calloc(strlen(fn) + 5, 1);
- strcpy(fnidx, fn); strcat(fnidx, ".bci");
- } else fnidx = strdup(_fnidx);
- fpidx = bgzf_open(fnidx, "w");
- if (fpidx == 0) {
- fprintf(stderr, "[bcf_idx_build2] fail to create the index file.\n");
- free(fnidx);
- bcf_idx_destroy(idx);
- return -1;
- }
- bcf_idx_save(idx, fpidx);
- bcf_idx_destroy(idx);
- bgzf_close(fpidx);
- free(fnidx);
- return 0;
-}
-
-int bcf_idx_build(const char *fn)
-{
- return bcf_idx_build2(fn, 0);
-}
-
-/********************************************
- * parse a region in the format chr:beg-end *
- ********************************************/
-
-int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end)
-{
- char *s, *p;
- int i, l, k;
- l = strlen(str);
- p = s = (char*)malloc(l+1);
- /* squeeze out "," */
- for (i = k = 0; i != l; ++i)
- if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i];
- s[k] = 0;
- for (i = 0; i != k; ++i) if (s[i] == ':') break;
- s[i] = 0;
- if ((*tid = bcf_str2id(str2id, s)) < 0) {
- free(s);
- return -1;
- }
- if (i == k) { /* dump the whole sequence */
- *begin = 0; *end = 1<<29; free(s);
- return 0;
- }
- for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
- *begin = atoi(p);
- if (i < k) {
- p = s + i + 1;
- *end = atoi(p);
- } else *end = 1<<29;
- if (*begin > 0) --*begin;
- free(s);
- if (*begin > *end) return -1;
- return 0;
-}
-
-/*******************************
- * retrieve a specified region *
- *******************************/
-
-uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg)
-{
- uint64_t min_off, *offset;
- int i;
- if (beg < 0) beg = 0;
- offset = idx->index2[tid].offset;
- for (i = beg>>TAD_LIDX_SHIFT; i < idx->index2[tid].n && offset[i] == 0; ++i);
- min_off = (i == idx->index2[tid].n)? offset[idx->index2[tid].n-1] : offset[i];
- return min_off;
-}
-
-int bcf_main_index(int argc, char *argv[])
-{
- if (argc == 1) {
- fprintf(stderr, "Usage: bcftools index <in.bcf>\n");
- return 1;
- }
- bcf_idx_build(argv[1]);
- return 0;
-}
diff --git a/sam/bcftools/kfunc.c b/sam/bcftools/kfunc.c
deleted file mode 100644
index a637b6c..0000000
--- a/sam/bcftools/kfunc.c
+++ /dev/null
@@ -1,162 +0,0 @@
-#include <math.h>
-
-
-/* Log gamma function
- * \log{\Gamma(z)}
- * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
- */
-double kf_lgamma(double z)
-{
- double x = 0;
- x += 0.1659470187408462e-06 / (z+7);
- x += 0.9934937113930748e-05 / (z+6);
- x -= 0.1385710331296526 / (z+5);
- x += 12.50734324009056 / (z+4);
- x -= 176.6150291498386 / (z+3);
- x += 771.3234287757674 / (z+2);
- x -= 1259.139216722289 / (z+1);
- x += 676.5203681218835 / z;
- x += 0.9999999999995183;
- return log(x) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5);
-}
-
-/* complementary error function
- * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt
- * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66
- */
-double kf_erfc(double x)
-{
- const double p0 = 220.2068679123761;
- const double p1 = 221.2135961699311;
- const double p2 = 112.0792914978709;
- const double p3 = 33.912866078383;
- const double p4 = 6.37396220353165;
- const double p5 = .7003830644436881;
- const double p6 = .03526249659989109;
- const double q0 = 440.4137358247522;
- const double q1 = 793.8265125199484;
- const double q2 = 637.3336333788311;
- const double q3 = 296.5642487796737;
- const double q4 = 86.78073220294608;
- const double q5 = 16.06417757920695;
- const double q6 = 1.755667163182642;
- const double q7 = .08838834764831844;
- double expntl, z, p;
- z = fabs(x) * M_SQRT2;
- if (z > 37.) return x > 0.? 0. : 2.;
- expntl = exp(z * z * - .5);
- if (z < 10. / M_SQRT2) // for small z
- p = expntl * ((((((p6 * z + p5) * z + p4) * z + p3) * z + p2) * z + p1) * z + p0)
- / (((((((q7 * z + q6) * z + q5) * z + q4) * z + q3) * z + q2) * z + q1) * z + q0);
- else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65)))));
- return x > 0.? 2. * p : 2. * (1. - p);
-}
-
-/* The following computes regularized incomplete gamma functions.
- * Formulas are taken from Wiki, with additional input from Numerical
- * Recipes in C (for modified Lentz's algorithm) and AS245
- * (http://lib.stat.cmu.edu/apstat/245).
- *
- * A good online calculator is available at:
- *
- * http://www.danielsoper.com/statcalc/calc23.aspx
- *
- * It calculates upper incomplete gamma function, which equals
- * kf_gammaq(s,z)*tgamma(s).
- */
-
-#define KF_GAMMA_EPS 1e-14
-#define KF_TINY 1e-290
-
-// regularized lower incomplete gamma function, by series expansion
-static double _kf_gammap(double s, double z)
-{
- double sum, x;
- int k;
- for (k = 1, sum = x = 1.; k < 100; ++k) {
- sum += (x *= z / (s + k));
- if (x / sum < KF_GAMMA_EPS) break;
- }
- return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(sum));
-}
-// regularized upper incomplete gamma function, by continued fraction
-static double _kf_gammaq(double s, double z)
-{
- int j;
- double C, D, f;
- f = 1. + z - s; C = f; D = 0.;
- // Modified Lentz's algorithm for computing continued fraction
- // See Numerical Recipes in C, 2nd edition, section 5.2
- for (j = 1; j < 100; ++j) {
- double a = j * (s - j), b = (j<<1) + 1 + z - s, d;
- D = b + a * D;
- if (D < KF_TINY) D = KF_TINY;
- C = b + a / C;
- if (C < KF_TINY) C = KF_TINY;
- D = 1. / D;
- d = C * D;
- f *= d;
- if (fabs(d - 1.) < KF_GAMMA_EPS) break;
- }
- return exp(s * log(z) - z - kf_lgamma(s) - log(f));
-}
-
-double kf_gammap(double s, double z)
-{
- return z <= 1. || z < s? _kf_gammap(s, z) : 1. - _kf_gammaq(s, z);
-}
-
-double kf_gammaq(double s, double z)
-{
- return z <= 1. || z < s? 1. - _kf_gammap(s, z) : _kf_gammaq(s, z);
-}
-
-/* Regularized incomplete beta function. The method is taken from
- * Numerical Recipe in C, 2nd edition, section 6.4. The following web
- * page calculates the incomplete beta function, which equals
- * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b):
- *
- * http://www.danielsoper.com/statcalc/calc36.aspx
- */
-static double kf_betai_aux(double a, double b, double x)
-{
- double C, D, f;
- int j;
- if (x == 0.) return 0.;
- if (x == 1.) return 1.;
- f = 1.; C = f; D = 0.;
- // Modified Lentz's algorithm for computing continued fraction
- for (j = 1; j < 200; ++j) {
- double aa, d;
- int m = j>>1;
- aa = (j&1)? -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1))
- : m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m));
- D = 1. + aa * D;
- if (D < KF_TINY) D = KF_TINY;
- C = 1. + aa / C;
- if (C < KF_TINY) C = KF_TINY;
- D = 1. / D;
- d = C * D;
- f *= d;
- if (fabs(d - 1.) < KF_GAMMA_EPS) break;
- }
- return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / f;
-}
-double kf_betai(double a, double b, double x)
-{
- return x < (a + 1.) / (a + b + 2.)? kf_betai_aux(a, b, x) : 1. - kf_betai_aux(b, a, 1. - x);
-}
-
-#ifdef KF_MAIN
-#include <stdio.h>
-int main(int argc, char *argv[])
-{
- double x = 5.5, y = 3;
- double a, b;
- printf("erfc(%lg): %lg, %lg\n", x, erfc(x), kf_erfc(x));
- printf("upper-gamma(%lg,%lg): %lg\n", x, y, kf_gammaq(y, x)*tgamma(y));
- a = 2; b = 2; x = 0.5;
- printf("incomplete-beta(%lg,%lg,%lg): %lg\n", a, b, x, kf_betai(a, b, x) / exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b)));
- return 0;
-}
-#endif
diff --git a/sam/bcftools/kmin.c b/sam/bcftools/kmin.c
deleted file mode 100644
index 5b8193b..0000000
--- a/sam/bcftools/kmin.c
+++ /dev/null
@@ -1,209 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008, 2010 by Attractive Chaos <***@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/* Hooke-Jeeves algorithm for nonlinear minimization
-
- Based on the pseudocodes by Bell and Pike (CACM 9(9):684-685), and
- the revision by Tomlin and Smith (CACM 12(11):637-638). Both of the
- papers are comments on Kaupe's Algorithm 178 "Direct Search" (ACM
- 6(6):313-314). The original algorithm was designed by Hooke and
- Jeeves (ACM 8:212-229). This program is further revised according to
- Johnson's implementation at Netlib (opt/hooke.c).
-
- Hooke-Jeeves algorithm is very simple and it works quite well on a
- few examples. However, it might fail to converge due to its heuristic
- nature. A possible improvement, as is suggested by Johnson, may be to
- choose a small r at the beginning to quickly approach to the minimum
- and a large r at later step to hit the minimum.
- */
-
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include "kmin.h"
-
-static double __kmin_hj_aux(kmin_f func, int n, double *x1, void *data, double fx1, double *dx, int *n_calls)
-{
- int k, j = *n_calls;
- double ftmp;
- for (k = 0; k != n; ++k) {
- x1[k] += dx[k];
- ftmp = func(n, x1, data); ++j;
- if (ftmp < fx1) fx1 = ftmp;
- else { /* search the opposite direction */
- dx[k] = 0.0 - dx[k];
- x1[k] += dx[k] + dx[k];
- ftmp = func(n, x1, data); ++j;
- if (ftmp < fx1) fx1 = ftmp;
- else x1[k] -= dx[k]; /* back to the original x[k] */
- }
- }
- *n_calls = j;
- return fx1; /* here: fx1=f(n,x1) */
-}
-
-double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls)
-{
- double fx, fx1, *x1, *dx, radius;
- int k, n_calls = 0;
- x1 = (double*)calloc(n, sizeof(double));
- dx = (double*)calloc(n, sizeof(double));
- for (k = 0; k != n; ++k) { /* initial directions, based on MGJ */
- dx[k] = fabs(x[k]) * r;
- if (dx[k] == 0) dx[k] = r;
- }
- radius = r;
- fx1 = fx = func(n, x, data); ++n_calls;
- for (;;) {
- memcpy(x1, x, n * sizeof(double)); /* x1 = x */
- fx1 = __kmin_hj_aux(func, n, x1, data, fx, dx, &n_calls);
- while (fx1 < fx) {
- for (k = 0; k != n; ++k) {
- double t = x[k];
- dx[k] = x1[k] > x[k]? fabs(dx[k]) : 0.0 - fabs(dx[k]);
- x[k] = x1[k];
- x1[k] = x1[k] + x1[k] - t;
- }
- fx = fx1;
- if (n_calls >= max_calls) break;
- fx1 = func(n, x1, data); ++n_calls;
- fx1 = __kmin_hj_aux(func, n, x1, data, fx1, dx, &n_calls);
- if (fx1 >= fx) break;
- for (k = 0; k != n; ++k)
- if (fabs(x1[k] - x[k]) > .5 * fabs(dx[k])) break;
- if (k == n) break;
- }
- if (radius >= eps) {
- if (n_calls >= max_calls) break;
- radius *= r;
- for (k = 0; k != n; ++k) dx[k] *= r;
- } else break; /* converge */
- }
- free(x1); free(dx);
- return fx1;
-}
-
-// I copied this function somewhere several years ago with some of my modifications, but I forgot the source.
-double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin)
-{
- double bound, u, r, q, fu, tmp, fa, fb, fc, c;
- const double gold1 = 1.6180339887;
- const double gold2 = 0.3819660113;
- const double tiny = 1e-20;
- const int max_iter = 100;
-
- double e, d, w, v, mid, tol1, tol2, p, eold, fv, fw;
- int iter;
-
- fa = func(a, data); fb = func(b, data);
- if (fb > fa) { // swap, such that f(a) > f(b)
- tmp = a; a = b; b = tmp;
- tmp = fa; fa = fb; fb = tmp;
- }
- c = b + gold1 * (b - a), fc = func(c, data); // golden section extrapolation
- while (fb > fc) {
- bound = b + 100.0 * (c - b); // the farthest point where we want to go
- r = (b - a) * (fb - fc);
- q = (b - c) * (fb - fa);
- if (fabs(q - r) < tiny) { // avoid 0 denominator
- tmp = q > r? tiny : 0.0 - tiny;
- } else tmp = q - r;
- u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp); // u is the parabolic extrapolation point
- if ((b > u && u > c) || (b < u && u < c)) { // u lies between b and c
- fu = func(u, data);
- if (fu < fc) { // (b,u,c) bracket the minimum
- a = b; b = u; fa = fb; fb = fu;
- break;
- } else if (fu > fb) { // (a,b,u) bracket the minimum
- c = u; fc = fu;
- break;
- }
- u = c + gold1 * (c - b); fu = func(u, data); // golden section extrapolation
- } else if ((c > u && u > bound) || (c < u && u < bound)) { // u lies between c and bound
- fu = func(u, data);
- if (fu < fc) { // fb > fc > fu
- b = c; c = u; u = c + gold1 * (c - b);
- fb = fc; fc = fu; fu = func(u, data);
- } else { // (b,c,u) bracket the minimum
- a = b; b = c; c = u;
- fa = fb; fb = fc; fc = fu;
- break;
- }
- } else if ((u > bound && bound > c) || (u < bound && bound < c)) { // u goes beyond the bound
- u = bound; fu = func(u, data);
- } else { // u goes the other way around, use golden section extrapolation
- u = c + gold1 * (c - b); fu = func(u, data);
- }
- a = b; b = c; c = u;
- fa = fb; fb = fc; fc = fu;
- }
- if (a > c) u = a, a = c, c = u; // swap
-
- // now, a<b<c, fa>fb and fb<fc, move on to Brent's algorithm
- e = d = 0.0;
- w = v = b; fv = fw = fb;
- for (iter = 0; iter != max_iter; ++iter) {
- mid = 0.5 * (a + c);
- tol2 = 2.0 * (tol1 = tol * fabs(b) + tiny);
- if (fabs(b - mid) <= (tol2 - 0.5 * (c - a))) {
- *xmin = b; return fb; // found
- }
- if (fabs(e) > tol1) {
- // related to parabolic interpolation
- r = (b - w) * (fb - fv);
- q = (b - v) * (fb - fw);
- p = (b - v) * q - (b - w) * r;
- q = 2.0 * (q - r);
- if (q > 0.0) p = 0.0 - p;
- else q = 0.0 - q;
- eold = e; e = d;
- if (fabs(p) >= fabs(0.5 * q * eold) || p <= q * (a - b) || p >= q * (c - b)) {
- d = gold2 * (e = (b >= mid ? a - b : c - b));
- } else {
- d = p / q; u = b + d; // actual parabolic interpolation happens here
- if (u - a < tol2 || c - u < tol2)
- d = (mid > b)? tol1 : 0.0 - tol1;
- }
- } else d = gold2 * (e = (b >= mid ? a - b : c - b)); // golden section interpolation
- u = fabs(d) >= tol1 ? b + d : b + (d > 0.0? tol1 : -tol1);
- fu = func(u, data);
- if (fu <= fb) { // u is the minimum point so far
- if (u >= b) a = b;
- else c = b;
- v = w; w = b; b = u; fv = fw; fw = fb; fb = fu;
- } else { // adjust (a,c) and (u,v,w)
- if (u < b) a = u;
- else c = u;
- if (fu <= fw || w == b) {
- v = w; w = u;
- fv = fw; fw = fu;
- } else if (fu <= fv || v == b || v == w) {
- v = u; fv = fu;
- }
- }
- }
- *xmin = b;
- return fb;
-}
diff --git a/sam/bcftools/kmin.h b/sam/bcftools/kmin.h
deleted file mode 100644
index 6feba45..0000000
--- a/sam/bcftools/kmin.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- Copyright (c) 2008, 2010 by Attractive Chaos <***@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-#ifndef KMIN_H
-#define KMIN_H
-
-#define KMIN_RADIUS 0.5
-#define KMIN_EPS 1e-7
-#define KMIN_MAXCALL 50000
-
-typedef double (*kmin_f)(int, double*, void*);
-typedef double (*kmin1_f)(double, void*);
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls);
- double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/sam/bcftools/main.c b/sam/bcftools/main.c
deleted file mode 100644
index eda6217..0000000
--- a/sam/bcftools/main.c
+++ /dev/null
@@ -1,191 +0,0 @@
-#include <string.h>
-#include <stdlib.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include "knetfile.h"
-#include "bcf.h"
-
-#include "kseq.h"
-KSTREAM_INIT(gzFile, gzread, 0x10000)
-
-int bcfview(int argc, char *argv[]);
-int bcf_main_index(int argc, char *argv[]);
-
-#define BUF_SIZE 0x10000
-
-int bcf_cat(int n, char * const *fn)
-{
- int i;
- bcf_t *out;
- uint8_t *buf;
- buf = malloc(BUF_SIZE);
- out = bcf_open("-", "w");
- for (i = 0; i < n; ++i) {
- bcf_t *in;
- bcf_hdr_t *h;
- off_t end;
- struct stat s;
- in = bcf_open(fn[i], "r");
- h = bcf_hdr_read(in);
- if (i == 0) bcf_hdr_write(out, h);
- bcf_hdr_destroy(h);
-#ifdef _USE_KNETFILE
- fstat(knet_fileno((knetFile*)in->fp->fp), &s);
- end = s.st_size - 28;
- while (knet_tell((knetFile*)in->fp->fp) < end) {
- int size = knet_tell((knetFile*)in->fp->fp) + BUF_SIZE < end? BUF_SIZE : end - knet_tell((knetFile*)in->fp->fp);
- knet_read(in->fp->fp, buf, size);
- fwrite(buf, 1, size, out->fp->fp);
- }
-#else
- abort(); // FIXME: not implemented
-#endif
- bcf_close(in);
- }
- bcf_close(out);
- free(buf);
- return 0;
-}
-
-extern double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]);
-
-int bcf_main_ldpair(int argc, char *argv[])
-{
- bcf_t *fp;
- bcf_hdr_t *h;
- bcf1_t *b0, *b1;
- bcf_idx_t *idx;
- kstring_t str;
- void *str2id;
- gzFile fplist;
- kstream_t *ks;
- int dret, lineno = 0;
- if (argc < 3) {
- fprintf(stderr, "Usage: bcftools ldpair <in.bcf> <in.list>\n");
- return 1;
- }
- fplist = gzopen(argv[2], "rb");
- ks = ks_init(fplist);
- memset(&str, 0, sizeof(kstring_t));
- fp = bcf_open(argv[1], "rb");
- h = bcf_hdr_read(fp);
- str2id = bcf_build_refhash(h);
- idx = bcf_idx_load(argv[1]);
- if (idx == 0) {
- fprintf(stderr, "[%s] No bcf index is found. Abort!\n", __func__);
- return 1;
- }
- b0 = calloc(1, sizeof(bcf1_t));
- b1 = calloc(1, sizeof(bcf1_t));
- while (ks_getuntil(ks, '\n', &str, &dret) >= 0) {
- char *p, *q;
- int k;
- int tid0 = -1, tid1 = -1, pos0 = -1, pos1 = -1;
- ++lineno;
- for (p = q = str.s, k = 0; *p; ++p) {
- if (*p == ' ' || *p == '\t') {
- *p = '\0';
- if (k == 0) tid0 = bcf_str2id(str2id, q);
- else if (k == 1) pos0 = atoi(q) - 1;
- else if (k == 2) tid1 = strcmp(q, "=")? bcf_str2id(str2id, q) : tid0;
- else if (k == 3) pos1 = atoi(q) - 1;
- q = p + 1;
- ++k;
- }
- }
- if (k == 3) pos1 = atoi(q) - 1;
- if (tid0 >= 0 && tid1 >= 0 && pos0 >= 0 && pos1 >= 0) {
- uint64_t off;
- double r, f[4];
- off = bcf_idx_query(idx, tid0, pos0);
- bgzf_seek(fp->fp, off, SEEK_SET);
- while (bcf_read(fp, h, b0) >= 0 && b0->pos != pos0);
- off = bcf_idx_query(idx, tid1, pos1);
- bgzf_seek(fp->fp, off, SEEK_SET);
- while (bcf_read(fp, h, b1) >= 0 && b1->pos != pos1);
- r = bcf_pair_freq(b0, b1, f);
- r *= r;
- printf("%s\t%d\t%s\t%d\t%.4g\t%.4g\t%.4g\t%.4g\t%.4g\n", h->ns[tid0], pos0+1, h->ns[tid1], pos1+1,
- r, f[0], f[1], f[2], f[3]);
- } //else fprintf(stderr, "[%s] Parse error at line %d.\n", __func__, lineno);
- }
- bcf_destroy(b0); bcf_destroy(b1);
- bcf_idx_destroy(idx);
- bcf_str2id_destroy(str2id);
- bcf_hdr_destroy(h);
- bcf_close(fp);
- free(str.s);
- ks_destroy(ks);
- gzclose(fplist);
- return 0;
-}
-
-int bcf_main_ld(int argc, char *argv[])
-{
- bcf_t *fp;
- bcf_hdr_t *h;
- bcf1_t **b, *b0;
- int i, j, m, n;
- double f[4];
- if (argc == 1) {
- fprintf(stderr, "Usage: bcftools ld <in.bcf>\n");
- return 1;
- }
- fp = bcf_open(argv[1], "rb");
- h = bcf_hdr_read(fp);
- // read the entire BCF
- m = n = 0; b = 0;
- b0 = calloc(1, sizeof(bcf1_t));
- while (bcf_read(fp, h, b0) >= 0) {
- if (m == n) {
- m = m? m<<1 : 16;
- b = realloc(b, sizeof(void*) * m);
- }
- b[n] = calloc(1, sizeof(bcf1_t));
- bcf_cpy(b[n++], b0);
- }
- bcf_destroy(b0);
- // compute pair-wise r^2
- printf("%d\n", n); // the number of loci
- for (i = 0; i < n; ++i) {
- printf("%s:%d", h->ns[b[i]->tid], b[i]->pos + 1);
- for (j = 0; j < i; ++j) {
- double r = bcf_pair_freq(b[i], b[j], f);
- printf("\t%.3f", r*r);
- }
- printf("\t1.000\n");
- }
- // free
- for (i = 0; i < n; ++i) bcf_destroy(b[i]);
- free(b);
- bcf_hdr_destroy(h);
- bcf_close(fp);
- return 0;
-}
-
-int main(int argc, char *argv[])
-{
- if (argc == 1) {
- fprintf(stderr, "\n");
- fprintf(stderr, "Program: bcftools (Tools for data in the VCF/BCF formats)\n");
- fprintf(stderr, "Version: %s\n\n", BCF_VERSION);
- fprintf(stderr, "Usage: bcftools <command> <arguments>\n\n");
- fprintf(stderr, "Command: view print, extract, convert and call SNPs from BCF\n");
- fprintf(stderr, " index index BCF\n");
- fprintf(stderr, " cat concatenate BCFs\n");
- fprintf(stderr, " ld compute all-pair r^2\n");
- fprintf(stderr, " ldpair compute r^2 between requested pairs\n");
- fprintf(stderr, "\n");
- return 1;
- }
- if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1);
- else if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1);
- else if (strcmp(argv[1], "ld") == 0) return bcf_main_ld(argc-1, argv+1);
- else if (strcmp(argv[1], "ldpair") == 0) return bcf_main_ldpair(argc-1, argv+1);
- else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2); // cat is different ...
- else {
- fprintf(stderr, "[main] Unrecognized command.\n");
- return 1;
- }
- return 0;
-}
diff --git a/sam/bcftools/mut.c b/sam/bcftools/mut.c
deleted file mode 100644
index 15ef265..0000000
--- a/sam/bcftools/mut.c
+++ /dev/null
@@ -1,127 +0,0 @@
-#include <stdlib.h>
-#include <stdint.h>
-#include "bcf.h"
-
-#define MAX_GENO 359
-
-int8_t seq_bitcnt[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
-char *seq_nt16rev = "XACMGRSVTWYHKDBN";
-
-uint32_t *bcf_trio_prep(int is_x, int is_son)
-{
- int i, j, k, n, map[10];
- uint32_t *ret;
- ret = calloc(MAX_GENO, 4);
- for (i = 0, k = 0; i < 4; ++i)
- for (j = i; j < 4; ++j)
- map[k++] = 1<<i|1<<j;
- for (i = 0, n = 1; i < 10; ++i) { // father
- if (is_x && seq_bitcnt[map[i]] != 1) continue;
- if (is_x && is_son) {
- for (j = 0; j < 10; ++j) // mother
- for (k = 0; k < 10; ++k) // child
- if (seq_bitcnt[map[k]] == 1 && (map[j]&map[k]))
- ret[n++] = j<<16 | i<<8 | k;
- } else {
- for (j = 0; j < 10; ++j) // mother
- for (k = 0; k < 10; ++k) // child
- if ((map[i]&map[k]) && (map[j]&map[k]) && ((map[i]|map[j])&map[k]) == map[k])
- ret[n++] = j<<16 | i<<8 | k;
- }
- }
- ret[0] = n - 1;
- return ret;
-}
-
-
-int bcf_trio_call(const uint32_t *prep, const bcf1_t *b, int *llr, int64_t *gt)
-{
- int i, j, k;
- const bcf_ginfo_t *PL;
- uint8_t *gl10;
- int map[10];
- if (b->n_smpl != 3) return -1; // not a trio
- for (i = 0; i < b->n_gi; ++i)
- if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
- if (i == b->n_gi) return -1; // no PL
- gl10 = alloca(10 * b->n_smpl);
- if (bcf_gl10(b, gl10) < 0) {
- if (bcf_gl10_indel(b, gl10) < 0) return -1;
- }
- PL = b->gi + i;
- for (i = 0, k = 0; i < 4; ++i)
- for (j = i; j < 4; ++j)
- map[k++] = seq_nt16rev[1<<i|1<<j];
- for (j = 0; j < 3; ++j) // check if ref hom is the most probable in all members
- if (((uint8_t*)PL->data)[j * PL->len] != 0) break;
- if (j < 3) { // we need to go through the complex procedure
- uint8_t *g[3];
- int minc = 1<<30, minc_j = -1, minf = 0, gtf = 0, gtc = 0;
- g[0] = gl10;
- g[1] = gl10 + 10;
- g[2] = gl10 + 20;
- for (j = 1; j <= (int)prep[0]; ++j) { // compute LK with constraint
- int sum = g[0][prep[j]&0xff] + g[1][prep[j]>>8&0xff] + g[2][prep[j]>>16&0xff];
- if (sum < minc) minc = sum, minc_j = j;
- }
- gtc |= map[prep[minc_j]&0xff]; gtc |= map[prep[minc_j]>>8&0xff]<<8; gtc |= map[prep[minc_j]>>16]<<16;
- for (j = 0; j < 3; ++j) { // compute LK without constraint
- int min = 1<<30, min_k = -1;
- for (k = 0; k < 10; ++k)
- if (g[j][k] < min) min = g[j][k], min_k = k;
- gtf |= map[min_k]<<(j*8);
- minf += min;
- }
- *llr = minc - minf; *gt = (int64_t)gtc<<32 | gtf;
- } else *llr = 0, *gt = -1;
- return 0;
-}
-
-int bcf_pair_call(const bcf1_t *b)
-{
- int i, j, k;
- const bcf_ginfo_t *PL;
- if (b->n_smpl != 2) return -1; // not a pair
- for (i = 0; i < b->n_gi; ++i)
- if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
- if (i == b->n_gi) return -1; // no PL
- PL = b->gi + i;
- for (j = 0; j < 2; ++j) // check if ref hom is the most probable in all members
- if (((uint8_t*)PL->data)[j * PL->len] != 0) break;
- if (j < 2) { // we need to go through the complex procedure
- uint8_t *g[2];
- int minc = 1<<30, minf = 0;
- g[0] = PL->data;
- g[1] = (uint8_t*)PL->data + PL->len;
- for (j = 0; j < PL->len; ++j) // compute LK with constraint
- minc = minc < g[0][j] + g[1][j]? minc : g[0][j] + g[1][j];
- for (j = 0; j < 2; ++j) { // compute LK without constraint
- int min = 1<<30;
- for (k = 0; k < PL->len; ++k)
- min = min < g[j][k]? min : g[j][k];
- minf += min;
- }
- return minc - minf;
- } else return 0;
-}
-
-int bcf_min_diff(const bcf1_t *b)
-{
- int i, min = 1<<30;
- const bcf_ginfo_t *PL;
- for (i = 0; i < b->n_gi; ++i)
- if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
- if (i == b->n_gi) return -1; // no PL
- PL = b->gi + i;
- for (i = 0; i < b->n_smpl; ++i) {
- int m1, m2, j;
- const uint8_t *p = (uint8_t*)PL->data;
- m1 = m2 = 1<<30;
- for (j = 0; j < PL->len; ++j) {
- if ((int)p[j] < m1) m2 = m1, m1 = p[j];
- else if ((int)p[j] < m2) m2 = p[j];
- }
- min = min < m2 - m1? min : m2 - m1;
- }
- return min;
-}
diff --git a/sam/bcftools/prob1.c b/sam/bcftools/prob1.c
deleted file mode 100644
index 3539ee3..0000000
--- a/sam/bcftools/prob1.c
+++ /dev/null
@@ -1,988 +0,0 @@
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include <errno.h>
-#include <assert.h>
-#include <limits.h>
-#include <zlib.h>
-#include "prob1.h"
-#include "kstring.h"
-
-#include "kseq.h"
-KSTREAM_INIT(gzFile, gzread, 16384)
-
-#define MC_MAX_EM_ITER 16
-#define MC_EM_EPS 1e-5
-#define MC_DEF_INDEL 0.15
-
-gzFile bcf_p1_fp_lk;
-
-unsigned char seq_nt4_table[256] = {
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
-};
-
-struct __bcf_p1aux_t {
- int n, M, n1, is_indel;
- uint8_t *ploidy; // haploid or diploid ONLY
- double *q2p, *pdg; // pdg -> P(D|g)
- double *phi, *phi_indel;
- double *z, *zswap; // aux for afs
- double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set
- double **hg; // hypergeometric distribution
- double *lf; // log factorial
- double t, t1, t2;
- double *afs, *afs1; // afs: accumulative AFS; afs1: site posterior distribution
- const uint8_t *PL; // point to PL
- int PL_len;
-};
-
-void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x)
-{
- int i;
- for (i = 0; i < ma->M; ++i)
- ma->phi_indel[i] = ma->phi[i] * x;
- ma->phi_indel[ma->M] = 1. - ma->phi[ma->M] * x;
-}
-
-static void init_prior(int type, double theta, int M, double *phi)
-{
- int i;
- if (type == MC_PTYPE_COND2) {
- for (i = 0; i <= M; ++i)
- phi[i] = 2. * (i + 1) / (M + 1) / (M + 2);
- } else if (type == MC_PTYPE_FLAT) {
- for (i = 0; i <= M; ++i)
- phi[i] = 1. / (M + 1);
- } else {
- double sum;
- for (i = 0, sum = 0.; i < M; ++i)
- sum += (phi[i] = theta / (M - i));
- phi[M] = 1. - sum;
- }
-}
-
-void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta)
-{
- init_prior(type, theta, ma->M, ma->phi);
- bcf_p1_indel_prior(ma, MC_DEF_INDEL);
-}
-
-void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta)
-{
- if (ma->n1 <= 0 || ma->n1 >= ma->M) return;
- init_prior(type, theta, 2*ma->n1, ma->phi1);
- init_prior(type, theta, 2*(ma->n - ma->n1), ma->phi2);
-}
-
-int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn)
-{
- gzFile fp;
- kstring_t s;
- kstream_t *ks;
- long double sum;
- int dret, k;
- memset(&s, 0, sizeof(kstring_t));
- fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
- ks = ks_init(fp);
- memset(ma->phi, 0, sizeof(double) * (ma->M + 1));
- while (ks_getuntil(ks, '\n', &s, &dret) >= 0) {
- if (strstr(s.s, "[afs] ") == s.s) {
- char *p = s.s + 6;
- for (k = 0; k <= ma->M; ++k) {
- int x;
- double y;
- x = strtol(p, &p, 10);
- if (x != k && (errno == EINVAL || errno == ERANGE)) return -1;
- ++p;
- y = strtod(p, &p);
- if (y == 0. && (errno == EINVAL || errno == ERANGE)) return -1;
- ma->phi[ma->M - k] += y;
- }
- }
- }
- ks_destroy(ks);
- gzclose(fp);
- free(s.s);
- for (sum = 0., k = 0; k <= ma->M; ++k) sum += ma->phi[k];
- fprintf(stderr, "[prior]");
- for (k = 0; k <= ma->M; ++k) ma->phi[k] /= sum;
- for (k = 0; k <= ma->M; ++k) fprintf(stderr, " %d:%.3lg", k, ma->phi[ma->M - k]);
- fputc('\n', stderr);
- for (sum = 0., k = 1; k < ma->M; ++k) sum += ma->phi[ma->M - k] * (2.* k * (ma->M - k) / ma->M / (ma->M - 1));
- fprintf(stderr, "[%s] heterozygosity=%lf, ", __func__, (double)sum);
- for (sum = 0., k = 1; k <= ma->M; ++k) sum += k * ma->phi[ma->M - k] / ma->M;
- fprintf(stderr, "theta=%lf\n", (double)sum);
- bcf_p1_indel_prior(ma, MC_DEF_INDEL);
- return 0;
-}
-
-bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy)
-{
- bcf_p1aux_t *ma;
- int i;
- ma = calloc(1, sizeof(bcf_p1aux_t));
- ma->n1 = -1;
- ma->n = n; ma->M = 2 * n;
- if (ploidy) {
- ma->ploidy = malloc(n);
- memcpy(ma->ploidy, ploidy, n);
- for (i = 0, ma->M = 0; i < n; ++i) ma->M += ploidy[i];
- if (ma->M == 2 * n) {
- free(ma->ploidy);
- ma->ploidy = 0;
- }
- }
- ma->q2p = calloc(256, sizeof(double));
- ma->pdg = calloc(3 * ma->n, sizeof(double));
- ma->phi = calloc(ma->M + 1, sizeof(double));
- ma->phi_indel = calloc(ma->M + 1, sizeof(double));
- ma->phi1 = calloc(ma->M + 1, sizeof(double));
- ma->phi2 = calloc(ma->M + 1, sizeof(double));
- ma->z = calloc(ma->M + 1, sizeof(double));
- ma->zswap = calloc(ma->M + 1, sizeof(double));
- ma->z1 = calloc(ma->M + 1, sizeof(double)); // actually we do not need this large
- ma->z2 = calloc(ma->M + 1, sizeof(double));
- ma->afs = calloc(ma->M + 1, sizeof(double));
- ma->afs1 = calloc(ma->M + 1, sizeof(double));
- ma->lf = calloc(ma->M + 1, sizeof(double));
- for (i = 0; i < 256; ++i)
- ma->q2p[i] = pow(10., -i / 10.);
- for (i = 0; i <= ma->M; ++i) ma->lf[i] = lgamma(i + 1);
- bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior
- return ma;
-}
-
-int bcf_p1_get_M(bcf_p1aux_t *b) { return b->M; }
-
-int bcf_p1_set_n1(bcf_p1aux_t *b, int n1)
-{
- if (n1 == 0 || n1 >= b->n) return -1;
- if (b->M != b->n * 2) {
- fprintf(stderr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__);
- return -1;
- }
- b->n1 = n1;
- return 0;
-}
-
-void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma)
-{
- // bcf_p1aux_t fields are not visible outside of prob1.c, hence this wrapper.
- // Ideally, this should set ploidy per site to allow pseudo-autosomal regions
- b->ploidy = ma->ploidy;
-}
-
-void bcf_p1_destroy(bcf_p1aux_t *ma)
-{
- if (ma) {
- int k;
- free(ma->lf);
- if (ma->hg && ma->n1 > 0) {
- for (k = 0; k <= 2*ma->n1; ++k) free(ma->hg[k]);
- free(ma->hg);
- }
- free(ma->ploidy); free(ma->q2p); free(ma->pdg);
- free(ma->phi); free(ma->phi_indel); free(ma->phi1); free(ma->phi2);
- free(ma->z); free(ma->zswap); free(ma->z1); free(ma->z2);
- free(ma->afs); free(ma->afs1);
- free(ma);
- }
-}
-
-extern double kf_gammap(double s, double z);
-int test16(bcf1_t *b, anno16_t *a);
-
-// Wigginton 2005, PMID: 15789306
-// written by Jan Wigginton
-double calc_hwe(int obs_hom1, int obs_hom2, int obs_hets)
-{
- if (obs_hom1 + obs_hom2 + obs_hets == 0 ) return 1;
-
- assert(obs_hom1 >= 0 && obs_hom2 >= 0 && obs_hets >= 0);
-
- int obs_homc = obs_hom1 < obs_hom2 ? obs_hom2 : obs_hom1;
- int obs_homr = obs_hom1 < obs_hom2 ? obs_hom1 : obs_hom2;
-
- int rare_copies = 2 * obs_homr + obs_hets;
- int genotypes = obs_hets + obs_homc + obs_homr;
-
- double *het_probs = (double*) calloc(rare_copies+1, sizeof(double));
-
- /* start at midpoint */
- int mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes);
-
- /* check to ensure that midpoint and rare alleles have same parity */
- if ((rare_copies & 1) ^ (mid & 1)) mid++;
-
- int curr_hets = mid;
- int curr_homr = (rare_copies - mid) / 2;
- int curr_homc = genotypes - curr_hets - curr_homr;
-
- het_probs[mid] = 1.0;
- double sum = het_probs[mid];
- for (curr_hets = mid; curr_hets > 1; curr_hets -= 2)
- {
- het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0));
- sum += het_probs[curr_hets - 2];
-
- /* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */
- curr_homr++;
- curr_homc++;
- }
-
- curr_hets = mid;
- curr_homr = (rare_copies - mid) / 2;
- curr_homc = genotypes - curr_hets - curr_homr;
- for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2)
- {
- het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc /((curr_hets + 2.0) * (curr_hets + 1.0));
- sum += het_probs[curr_hets + 2];
-
- /* add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote */
- curr_homr--;
- curr_homc--;
- }
- int i;
- for (i = 0; i <= rare_copies; i++) het_probs[i] /= sum;
-
- /* p-value calculation for p_hwe */
- double p_hwe = 0.0;
- for (i = 0; i <= rare_copies; i++)
- {
- if (het_probs[i] > het_probs[obs_hets])
- continue;
- p_hwe += het_probs[i];
- }
-
- p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe;
- free(het_probs);
- return p_hwe;
-
-}
-
-
-static void _bcf1_set_ref(bcf1_t *b, int idp)
-{
- kstring_t s;
- int old_n_gi = b->n_gi;
- s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str;
- kputs(":GT", &s); kputc('\0', &s);
- b->m_str = s.m; b->l_str = s.l; b->str = s.s;
- bcf_sync(b);
-
- // Call GTs
- int isample, an = 0;
- for (isample = 0; isample < b->n_smpl; isample++)
- {
- if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 )
- ((uint8_t*)b->gi[old_n_gi].data)[isample] = 1<<7;
- else
- {
- ((uint8_t*)b->gi[old_n_gi].data)[isample] = 0;
- an += b->ploidy ? b->ploidy[isample] : 2;
- }
- }
- bcf_fit_alt(b,1);
- b->qual = 999;
-
- // Prepare BCF for output: ref, alt, filter, info, format
- memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s);
- kputs(b->ref, &s); kputc('\0', &s);
- kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s);
- {
- ksprintf(&s, "AN=%d;", an);
- kputs(b->info, &s);
- anno16_t a;
- int has_I16 = test16(b, &a) >= 0? 1 : 0;
- if (has_I16 )
- {
- if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
- ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
- }
- kputc('\0', &s);
- rm_info(&s, "I16=");
- rm_info(&s, "QS=");
- }
- kputs(b->fmt, &s); kputc('\0', &s);
- free(b->str);
- b->m_str = s.m; b->l_str = s.l; b->str = s.s;
- bcf_sync(b);
-}
-
-int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold, int var_only)
-{
- int nals = 1;
- char *p;
- for (p=b->alt; *p; p++)
- {
- if ( *p=='X' || p[0]=='.' ) break;
- if ( p[0]==',' ) nals++;
- }
- if ( b->alt[0] && !*p ) nals++;
-
- if ( nals>4 )
- {
- if ( *b->ref=='N' ) return 0;
- fprintf(stderr,"Not ready for this, more than 4 alleles at %d: %s, %s\n", b->pos+1, b->ref,b->alt);
- exit(1);
- }
-
- // find PL, DV and DP FORMAT indexes
- uint8_t *pl = NULL;
- int i, npl = 0, idp = -1, idv = -1;
- for (i = 0; i < b->n_gi; ++i)
- {
- if (b->gi[i].fmt == bcf_str2int("PL", 2))
- {
- pl = (uint8_t*)b->gi[i].data;
- npl = b->gi[i].len;
- }
- else if (b->gi[i].fmt == bcf_str2int("DP", 2)) idp=i;
- else if (b->gi[i].fmt == bcf_str2int("DV", 2)) idv=i;
- }
- if ( nals==1 )
- {
- if ( !var_only ) _bcf1_set_ref(b, idp);
- return 1;
- }
- if ( !pl ) return -1;
-
- assert(ma->q2p[0] == 1);
-
- // Init P(D|G)
- int npdg = nals*(nals+1)/2;
- double *pdg,*_pdg;
- _pdg = pdg = malloc(sizeof(double)*ma->n*npdg);
- for (i=0; i<ma->n; i++)
- {
- int j;
- double sum = 0;
- for (j=0; j<npdg; j++)
- {
- //_pdg[j] = pow(10,-0.1*pl[j]);
- _pdg[j] = ma->q2p[pl[j]];
- sum += _pdg[j];
- }
- if ( sum )
- for (j=0; j<npdg; j++) _pdg[j] /= sum;
- _pdg += npdg;
- pl += npl;
- }
-
- if ((p = strstr(b->info, "QS=")) == 0) { fprintf(stderr,"INFO/QS is required with -m, exiting\n"); exit(1); }
- double qsum[4];
- if ( sscanf(p+3,"%lf,%lf,%lf,%lf",&qsum[0],&qsum[1],&qsum[2],&qsum[3])!=4 ) { fprintf(stderr,"Could not parse %s\n",p); exit(1); }
-
-
- // Calculate the most likely combination of alleles, remembering the most and second most likely set
- int ia,ib,ic, max_als=0, max_als2=0;
- double ref_lk = 0, max_lk = INT_MIN, max_lk2 = INT_MIN, lk_sum = INT_MIN, lk_sums[3];
- for (ia=0; ia<nals; ia++)
- {
- double lk_tot = 0;
- int iaa = (ia+1)*(ia+2)/2-1;
- int isample;
- for (isample=0; isample<ma->n; isample++)
- {
- double *p = pdg + isample*npdg;
- // assert( log(p[iaa]) <= 0 );
- lk_tot += log(p[iaa]);
- }
- if ( ia==0 ) ref_lk = lk_tot;
- if ( max_lk<lk_tot ) { max_lk2 = max_lk; max_als2 = max_als; max_lk = lk_tot; max_als = 1<<ia; }
- else if ( max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia; }
- lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum));
- }
- lk_sums[0] = lk_sum;
- if ( nals>1 )
- {
- for (ia=0; ia<nals; ia++)
- {
- if ( qsum[ia]==0 ) continue;
- int iaa = (ia+1)*(ia+2)/2-1;
- for (ib=0; ib<ia; ib++)
- {
- if ( qsum[ib]==0 ) continue;
- double lk_tot = 0;
- double fa = qsum[ia]/(qsum[ia]+qsum[ib]);
- double fb = qsum[ib]/(qsum[ia]+qsum[ib]);
- double fab = 2*fa*fb; fa *= fa; fb *= fb;
- int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
- for (isample=0; isample<ma->n; isample++)
- {
- double *p = pdg + isample*npdg;
- //assert( log(fa*p[iaa] + fb*p[ibb] + fab*p[iab]) <= 0 );
- if ( b->ploidy && b->ploidy[isample]==1 )
- lk_tot += log(fa*p[iaa] + fb*p[ibb]);
- else
- lk_tot += log(fa*p[iaa] + fb*p[ibb] + fab*p[iab]);
- }
- if ( max_lk<lk_tot ) { max_lk2 = max_lk; max_als2 = max_als; max_lk = lk_tot; max_als = 1<<ia|1<<ib; }
- else if ( max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia|1<<ib; }
- lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum));
- }
- }
- lk_sums[1] = lk_sum;
- }
- if ( nals>2 )
- {
- for (ia=0; ia<nals; ia++)
- {
- if ( qsum[ia]==0 ) continue;
- int iaa = (ia+1)*(ia+2)/2-1;
- for (ib=0; ib<ia; ib++)
- {
- if ( qsum[ib]==0 ) continue;
- int ibb = (ib+1)*(ib+2)/2-1;
- int iab = iaa - ia + ib;
- for (ic=0; ic<ib; ic++)
- {
- if ( qsum[ic]==0 ) continue;
- double lk_tot = 0;
- double fa = qsum[ia]/(qsum[ia]+qsum[ib]+qsum[ic]);
- double fb = qsum[ib]/(qsum[ia]+qsum[ib]+qsum[ic]);
- double fc = qsum[ic]/(qsum[ia]+qsum[ib]+qsum[ic]);
- double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; fa *= fa; fb *= fb; fc *= fc;
- int isample, icc = (ic+1)*(ic+2)/2-1;
- int iac = iaa - ia + ic, ibc = ibb - ib + ic;
- for (isample=0; isample<ma->n; isample++)
- {
- double *p = pdg + isample*npdg;
- //assert( log(fa*p[iaa] + fb*p[ibb] + fc*p[icc] + fab*p[iab] + fac*p[iac] + fbc*p[ibc]) <= 0 );
- if ( b->ploidy && b->ploidy[isample]==1 )
- lk_tot += log(fa*p[iaa] + fb*p[ibb] + fc*p[icc]);
- else
- lk_tot += log(fa*p[iaa] + fb*p[ibb] + fc*p[icc] + fab*p[iab] + fac*p[iac] + fbc*p[ibc]);
- }
- if ( max_lk<lk_tot ) { max_lk2 = max_lk; max_als2 = max_als; max_lk = lk_tot; max_als = 1<<ia|1<<ib|1<<ic; }
- else if ( max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia|1<<ib|1<<ic; }
- lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum));
- }
- }
- }
- lk_sums[2] = lk_sum;
- }
-
- // Should we add another allele, does it increase the likelihood significantly?
- int n1=0, n2=0;
- for (i=0; i<nals; i++) if ( max_als&1<<i) n1++;
- for (i=0; i<nals; i++) if ( max_als2&1<<i) n2++;
- if ( n2<n1 && kf_gammap(1,2.0*(max_lk-max_lk2))<threshold )
- {
- // the threshold not exceeded, use the second most likely set with fewer alleles
- max_lk = max_lk2;
- max_als = max_als2;
- n1 = n2;
- }
- lk_sum = lk_sums[n1-1];
-
- // Get the BCF record ready for GT and GQ
- kstring_t s;
- int old_n_gi = b->n_gi;
- s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str;
- kputs(":GT:GQ", &s); kputc('\0', &s);
- b->m_str = s.m; b->l_str = s.l; b->str = s.s;
- bcf_sync(b);
-
- // Call GTs
- int isample, gts=0, ac[4] = {0,0,0,0};
- int nRR = 0, nAA = 0, nRA = 0, max_dv = 0;
- for (isample = 0; isample < b->n_smpl; isample++)
- {
- int ploidy = b->ploidy ? b->ploidy[isample] : 2;
- double *p = pdg + isample*npdg;
- int ia, als = 0;
- double lk = 0, lk_s = 0;
- for (ia=0; ia<nals; ia++)
- {
- if ( !(max_als&1<<ia) ) continue;
- int iaa = (ia+1)*(ia+2)/2-1;
- double _lk = p[iaa]*qsum[ia]*qsum[ia];
- if ( _lk > lk ) { lk = _lk; als = ia<<3 | ia; }
- lk_s += _lk;
- }
- if ( ploidy==2 )
- {
- for (ia=0; ia<nals; ia++)
- {
- if ( !(max_als&1<<ia) ) continue;
- int iaa = (ia+1)*(ia+2)/2-1;
- for (ib=0; ib<ia; ib++)
- {
- if ( !(max_als&1<<ib) ) continue;
- int iab = iaa - ia + ib;
- double _lk = 2*qsum[ia]*qsum[ib]*p[iab];
- if ( _lk > lk ) { lk = _lk; als = ib<<3 | ia; }
- lk_s += _lk;
- }
- }
- }
- lk = -log(1-lk/lk_s)/0.2302585;
- int dp = 0;
- if ( idp>=0 && (dp=((uint16_t*)b->gi[idp].data)[isample])==0 )
- {
- // no coverage
- ((uint8_t*)b->gi[old_n_gi].data)[isample] = 1<<7;
- ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = 0;
- continue;
- }
- if ( lk>99 ) lk = 99;
- ((uint8_t*)b->gi[old_n_gi].data)[isample] = als;
- ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = (int)lk;
-
- // For MDV annotation
- int dv;
- if ( als && idv>=0 && (dv=((uint16_t*)b->gi[idv].data)[isample]) )
- {
- if ( max_dv < dv ) max_dv = dv;
- }
-
- // For HWE annotation; multiple ALT alleles treated as one
- if ( !als ) nRR++;
- else if ( !(als>>3&7) || !(als&7) ) nRA++;
- else nAA++;
-
- gts |= 1<<(als>>3&7) | 1<<(als&7);
- ac[ als>>3&7 ]++;
- ac[ als&7 ]++;
- }
- free(pdg);
- bcf_fit_alt(b,max_als);
-
- // The VCF spec is ambiguous about QUAL: is it the probability of anything else
- // (that is QUAL(non-ref) = P(ref)+P(any non-ref other than ALT)) or is it
- // QUAL(non-ref)=P(ref) and QUAL(ref)=1-P(ref)? Assuming the latter.
- b->qual = gts>1 ? -4.343*(ref_lk - lk_sum) : -4.343*log(1-exp(ref_lk - lk_sum));
- if ( b->qual>999 ) b->qual = 999;
-
- // Prepare BCF for output: ref, alt, filter, info, format
- memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s);
- kputs(b->ref, &s); kputc('\0', &s);
- kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s);
- {
- int an=0, nalts=0;
- for (i=0; i<nals; i++)
- {
- an += ac[i];
- if ( i>0 && ac[i] ) nalts++;
- }
- ksprintf(&s, "AN=%d;", an);
- if ( nalts )
- {
- kputs("AC=", &s);
- for (i=1; i<nals; i++)
- {
- if ( !(gts&1<<i) ) continue;
- nalts--;
- ksprintf(&s,"%d", ac[i]);
- if ( nalts>0 ) kputc(',', &s);
- }
- kputc(';', &s);
- }
- kputs(b->info, &s);
- anno16_t a;
- int has_I16 = test16(b, &a) >= 0? 1 : 0;
- if (has_I16 )
- {
- if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
- ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
- ksprintf(&s, ";QBD=%e", b->qual/(a.d[0] + a.d[1] + a.d[2] + a.d[3]));
- if ( max_dv ) ksprintf(&s, ";MDV=%d", max_dv);
- }
- if ( nAA+nRA )
- {
- double hwe = calc_hwe(nAA, nRR, nRA);
- ksprintf(&s, ";HWE=%e", hwe);
- }
- kputc('\0', &s);
- rm_info(&s, "I16=");
- rm_info(&s, "QS=");
- }
- kputs(b->fmt, &s); kputc('\0', &s);
- free(b->str);
- b->m_str = s.m; b->l_str = s.l; b->str = s.s;
- bcf_sync(b);
-
- return gts;
-}
-
-static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma)
-{
- int i, j;
- long *p, tmp;
- p = alloca(b->n_alleles * sizeof(long));
- memset(p, 0, sizeof(long) * b->n_alleles);
- for (j = 0; j < ma->n; ++j) {
- const uint8_t *pi = ma->PL + j * ma->PL_len;
- double *pdg = ma->pdg + j * 3;
- pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]];
- for (i = 0; i < b->n_alleles; ++i)
- p[i] += (int)pi[(i+1)*(i+2)/2-1];
- }
- for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i;
- for (i = 1; i < b->n_alleles; ++i) // insertion sort
- for (j = i; j > 0 && p[j] < p[j-1]; --j)
- tmp = p[j], p[j] = p[j-1], p[j-1] = tmp;
- for (i = b->n_alleles - 1; i >= 0; --i)
- if ((p[i]&0xf) == 0) break;
- return i;
-}
-
-
-int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
-{
- double sum, g[3];
- double max, f3[3], *pdg = ma->pdg + k * 3;
- int q, i, max_i, ploidy;
- ploidy = ma->ploidy? ma->ploidy[k] : 2;
- if (ploidy == 2) {
- f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
- } else {
- f3[0] = 1. - f0; f3[1] = 0; f3[2] = f0;
- }
- for (i = 0, sum = 0.; i < 3; ++i)
- sum += (g[i] = pdg[i] * f3[i]);
- for (i = 0, max = -1., max_i = 0; i < 3; ++i) {
- g[i] /= sum;
- if (g[i] > max) max = g[i], max_i = i;
- }
- max = 1. - max;
- if (max < 1e-308) max = 1e-308;
- q = (int)(-4.343 * log(max) + .499);
- if (q > 99) q = 99;
- return q<<2|max_i;
-}
-
-#define TINY 1e-20
-
-static void mc_cal_y_core(bcf_p1aux_t *ma, int beg)
-{
- double *z[2], *tmp, *pdg;
- int _j, last_min, last_max;
- assert(beg == 0 || ma->M == ma->n*2);
- z[0] = ma->z;
- z[1] = ma->zswap;
- pdg = ma->pdg;
- memset(z[0], 0, sizeof(double) * (ma->M + 1));
- memset(z[1], 0, sizeof(double) * (ma->M + 1));
- z[0][0] = 1.;
- last_min = last_max = 0;
- ma->t = 0.;
- if (ma->M == ma->n * 2) {
- int M = 0;
- for (_j = beg; _j < ma->n; ++_j) {
- int k, j = _j - beg, _min = last_min, _max = last_max, M0;
- double p[3], sum;
- M0 = M; M += 2;
- pdg = ma->pdg + _j * 3;
- p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2];
- for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.;
- for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.;
- _max += 2;
- if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k];
- if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1];
- for (k = _min < 2? 2 : _min; k <= _max; ++k)
- z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2];
- for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
- ma->t += log(sum / (M * (M - 1.)));
- for (k = _min; k <= _max; ++k) z[1][k] /= sum;
- if (_min >= 1) z[1][_min-1] = 0.;
- if (_min >= 2) z[1][_min-2] = 0.;
- if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.;
- if (_j == ma->n1 - 1) { // set pop1; ma->n1==-1 when unset
- ma->t1 = ma->t;
- memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1));
- }
- tmp = z[0]; z[0] = z[1]; z[1] = tmp;
- last_min = _min; last_max = _max;
- }
- //for (_j = 0; _j < last_min; ++_j) z[0][_j] = 0.; // TODO: are these necessary?
- //for (_j = last_max + 1; _j < ma->M; ++_j) z[0][_j] = 0.;
- } else { // this block is very similar to the block above; these two might be merged in future
- int j, M = 0;
- for (j = 0; j < ma->n; ++j) {
- int k, M0, _min = last_min, _max = last_max;
- double p[3], sum;
- pdg = ma->pdg + j * 3;
- for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.;
- for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.;
- M0 = M;
- M += ma->ploidy[j];
- if (ma->ploidy[j] == 1) {
- p[0] = pdg[0]; p[1] = pdg[2];
- _max++;
- if (_min == 0) k = 0, z[1][k] = (M0+1-k) * p[0] * z[0][k];
- for (k = _min < 1? 1 : _min; k <= _max; ++k)
- z[1][k] = (M0+1-k) * p[0] * z[0][k] + k * p[1] * z[0][k-1];
- for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
- ma->t += log(sum / M);
- for (k = _min; k <= _max; ++k) z[1][k] /= sum;
- if (_min >= 1) z[1][_min-1] = 0.;
- if (j < ma->n - 1) z[1][_max+1] = 0.;
- } else if (ma->ploidy[j] == 2) {
- p[0] = pdg[0]; p[1] = 2 * pdg[1]; p[2] = pdg[2];
- _max += 2;
- if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k];
- if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1];
- for (k = _min < 2? 2 : _min; k <= _max; ++k)
- z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2];
- for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
- ma->t += log(sum / (M * (M - 1.)));
- for (k = _min; k <= _max; ++k) z[1][k] /= sum;
- if (_min >= 1) z[1][_min-1] = 0.;
- if (_min >= 2) z[1][_min-2] = 0.;
- if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.;
- }
- tmp = z[0]; z[0] = z[1]; z[1] = tmp;
- last_min = _min; last_max = _max;
- }
- }
- if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1));
- if (bcf_p1_fp_lk)
- gzwrite(bcf_p1_fp_lk, ma->z, sizeof(double) * (ma->M + 1));
-}
-
-static void mc_cal_y(bcf_p1aux_t *ma)
-{
- if (ma->n1 > 0 && ma->n1 < ma->n && ma->M == ma->n * 2) { // NB: ma->n1 is ineffective when there are haploid samples
- int k;
- long double x;
- memset(ma->z1, 0, sizeof(double) * (2 * ma->n1 + 1));
- memset(ma->z2, 0, sizeof(double) * (2 * (ma->n - ma->n1) + 1));
- ma->t1 = ma->t2 = 0.;
- mc_cal_y_core(ma, ma->n1);
- ma->t2 = ma->t;
- memcpy(ma->z2, ma->z, sizeof(double) * (2 * (ma->n - ma->n1) + 1));
- mc_cal_y_core(ma, 0);
- // rescale z
- x = expl(ma->t - (ma->t1 + ma->t2));
- for (k = 0; k <= ma->M; ++k) ma->z[k] *= x;
- } else mc_cal_y_core(ma, 0);
-}
-
-#define CONTRAST_TINY 1e-30
-
-extern double kf_gammaq(double s, double z); // incomplete gamma function for chi^2 test
-
-static inline double chi2_test(int a, int b, int c, int d)
-{
- double x, z;
- x = (double)(a+b) * (c+d) * (b+d) * (a+c);
- if (x == 0.) return 1;
- z = a * d - b * c;
- return kf_gammaq(.5, .5 * z * z * (a+b+c+d) / x);
-}
-
-// chi2=(a+b+c+d)(ad-bc)^2/[(a+b)(c+d)(a+c)(b+d)]
-static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int k1, int k2, double x[3])
-{
- double p = p1->phi[k1+k2] * p1->z1[k1] * p1->z2[k2] / sum * p1->hg[k1][k2];
- int n1 = p1->n1, n2 = p1->n - p1->n1;
- if (p < CONTRAST_TINY) return -1;
- if (.5*k1/n1 < .5*k2/n2) x[1] += p;
- else if (.5*k1/n1 > .5*k2/n2) x[2] += p;
- else x[0] += p;
- return p * chi2_test(k1, k2, (n1<<1) - k1, (n2<<1) - k2);
-}
-
-static double contrast2(bcf_p1aux_t *p1, double ret[3])
-{
- int k, k1, k2, k10, k20, n1, n2;
- double sum;
- // get n1 and n2
- n1 = p1->n1; n2 = p1->n - p1->n1;
- if (n1 <= 0 || n2 <= 0) return 0.;
- if (p1->hg == 0) { // initialize the hypergeometric distribution
- /* NB: the hg matrix may take a lot of memory when there are many samples. There is a way
- to avoid precomputing this matrix, but it is slower and quite intricate. The following
- computation in this block can be accelerated with a similar strategy, but perhaps this
- is not a serious concern for now. */
- double tmp = lgamma(2*(n1+n2)+1) - (lgamma(2*n1+1) + lgamma(2*n2+1));
- p1->hg = calloc(2*n1+1, sizeof(void*));
- for (k1 = 0; k1 <= 2*n1; ++k1) {
- p1->hg[k1] = calloc(2*n2+1, sizeof(double));
- for (k2 = 0; k2 <= 2*n2; ++k2)
- p1->hg[k1][k2] = exp(lgamma(k1+k2+1) + lgamma(p1->M-k1-k2+1) - (lgamma(k1+1) + lgamma(k2+1) + lgamma(2*n1-k1+1) + lgamma(2*n2-k2+1) + tmp));
- }
- }
- { // compute
- long double suml = 0;
- for (k = 0; k <= p1->M; ++k) suml += p1->phi[k] * p1->z[k];
- sum = suml;
- }
- { // get the max k1 and k2
- double max;
- int max_k;
- for (k = 0, max = 0, max_k = -1; k <= 2*n1; ++k) {
- double x = p1->phi1[k] * p1->z1[k];
- if (x > max) max = x, max_k = k;
- }
- k10 = max_k;
- for (k = 0, max = 0, max_k = -1; k <= 2*n2; ++k) {
- double x = p1->phi2[k] * p1->z2[k];
- if (x > max) max = x, max_k = k;
- }
- k20 = max_k;
- }
- { // We can do the following with one nested loop, but that is an O(N^2) thing. The following code block is much faster for large N.
- double x[3], y;
- long double z = 0., L[2];
- x[0] = x[1] = x[2] = 0; L[0] = L[1] = 0;
- for (k1 = k10; k1 >= 0; --k1) {
- for (k2 = k20; k2 >= 0; --k2) {
- if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
- else z += y;
- }
- for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
- if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
- else z += y;
- }
- }
- ret[0] = x[0]; ret[1] = x[1]; ret[2] = x[2];
- x[0] = x[1] = x[2] = 0;
- for (k1 = k10 + 1; k1 <= 2*n1; ++k1) {
- for (k2 = k20; k2 >= 0; --k2) {
- if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
- else z += y;
- }
- for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
- if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
- else z += y;
- }
- }
- ret[0] += x[0]; ret[1] += x[1]; ret[2] += x[2];
- if (ret[0] + ret[1] + ret[2] < 0.95) { // in case of bad things happened
- ret[0] = ret[1] = ret[2] = 0; L[0] = L[1] = 0;
- for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1)
- for (k2 = 0; k2 <= 2*n2; ++k2)
- if ((y = contrast2_aux(p1, sum, k1, k2, ret)) >= 0) z += y;
- if (ret[0] + ret[1] + ret[2] < 0.95) // It seems that this may be caused by floating point errors. I do not really understand why...
- z = 1.0, ret[0] = ret[1] = ret[2] = 1./3;
- }
- return (double)z;
- }
-}
-
-static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_folded)
-{
- int k;
- long double sum = 0., sum2;
- double *phi = ma->is_indel? ma->phi_indel : ma->phi;
- memset(ma->afs1, 0, sizeof(double) * (ma->M + 1));
- mc_cal_y(ma);
- // compute AFS
- for (k = 0, sum = 0.; k <= ma->M; ++k)
- sum += (long double)phi[k] * ma->z[k];
- for (k = 0; k <= ma->M; ++k) {
- ma->afs1[k] = phi[k] * ma->z[k] / sum;
- if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.;
- }
- // compute folded variant probability
- for (k = 0, sum = 0.; k <= ma->M; ++k)
- sum += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k];
- for (k = 1, sum2 = 0.; k < ma->M; ++k)
- sum2 += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k];
- *p_var_folded = sum2 / sum;
- *p_ref_folded = (phi[k] + phi[ma->M - k]) / 2. * (ma->z[ma->M] + ma->z[0]) / sum;
- // the expected frequency
- for (k = 0, sum = 0.; k <= ma->M; ++k) {
- ma->afs[k] += ma->afs1[k];
- sum += k * ma->afs1[k];
- }
- return sum / ma->M;
-}
-
-int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst)
-{
- int i, k;
- long double sum = 0.;
- ma->is_indel = bcf_is_indel(b);
- rst->perm_rank = -1;
- // set PL and PL_len
- for (i = 0; i < b->n_gi; ++i) {
- if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
- ma->PL = (uint8_t*)b->gi[i].data;
- ma->PL_len = b->gi[i].len;
- break;
- }
- }
- if (i == b->n_gi) return -1; // no PL
- if (b->n_alleles < 2) return -1; // FIXME: find a better solution
- //
- rst->rank0 = cal_pdg(b, ma);
- rst->f_exp = mc_cal_afs(ma, &rst->p_ref_folded, &rst->p_var_folded);
- rst->p_ref = ma->afs1[ma->M];
- for (k = 0, sum = 0.; k < ma->M; ++k)
- sum += ma->afs1[k];
- rst->p_var = (double)sum;
- { // compute the allele count
- double max = -1;
- rst->ac = -1;
- for (k = 0; k <= ma->M; ++k)
- if (max < ma->z[k]) max = ma->z[k], rst->ac = k;
- rst->ac = ma->M - rst->ac;
- }
- // calculate f_flat and f_em
- for (k = 0, sum = 0.; k <= ma->M; ++k)
- sum += (long double)ma->z[k];
- rst->f_flat = 0.;
- for (k = 0; k <= ma->M; ++k) {
- double p = ma->z[k] / sum;
- rst->f_flat += k * p;
- }
- rst->f_flat /= ma->M;
- { // estimate equal-tail credible interval (95% level)
- int l, h;
- double p;
- for (i = 0, p = 0.; i <= ma->M; ++i)
- if (p + ma->afs1[i] > 0.025) break;
- else p += ma->afs1[i];
- l = i;
- for (i = ma->M, p = 0.; i >= 0; --i)
- if (p + ma->afs1[i] > 0.025) break;
- else p += ma->afs1[i];
- h = i;
- rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M;
- }
- if (ma->n1 > 0) { // compute LRT
- double max0, max1, max2;
- for (k = 0, max0 = -1; k <= ma->M; ++k)
- if (max0 < ma->z[k]) max0 = ma->z[k];
- for (k = 0, max1 = -1; k <= ma->n1 * 2; ++k)
- if (max1 < ma->z1[k]) max1 = ma->z1[k];
- for (k = 0, max2 = -1; k <= ma->M - ma->n1 * 2; ++k)
- if (max2 < ma->z2[k]) max2 = ma->z2[k];
- rst->lrt = log(max1 * max2 / max0);
- rst->lrt = rst->lrt < 0? 1 : kf_gammaq(.5, rst->lrt);
- } else rst->lrt = -1.0;
- rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0;
- if (do_contrast && rst->p_var > 0.5) // skip contrast2() if the locus is a strong non-variant
- rst->p_chi2 = contrast2(ma, rst->cmp);
- return 0;
-}
-
-void bcf_p1_dump_afs(bcf_p1aux_t *ma)
-{
- int k;
- fprintf(stderr, "[afs]");
- for (k = 0; k <= ma->M; ++k)
- fprintf(stderr, " %d:%.3lf", k, ma->afs[ma->M - k]);
- fprintf(stderr, "\n");
- memset(ma->afs, 0, sizeof(double) * (ma->M + 1));
-}
diff --git a/sam/bcftools/prob1.h b/sam/bcftools/prob1.h
deleted file mode 100644
index 6f93155..0000000
--- a/sam/bcftools/prob1.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#ifndef BCF_PROB1_H
-#define BCF_PROB1_H
-
-#include "bcf.h"
-
-struct __bcf_p1aux_t;
-typedef struct __bcf_p1aux_t bcf_p1aux_t;
-
-typedef struct {
- int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal()
- int ac; // ML alternative allele count
- double f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var;
- double cil, cih;
- double cmp[3], p_chi2, lrt; // used by contrast2()
-} bcf_p1rst_t;
-
-typedef struct {
- double p[4];
- int mq, depth, is_tested, d[4];
-} anno16_t;
-
-#define MC_PTYPE_FULL 1
-#define MC_PTYPE_COND2 2
-#define MC_PTYPE_FLAT 3
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy);
- void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta);
- void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta);
- void bcf_p1_destroy(bcf_p1aux_t *ma);
- void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma);
- int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst);
- int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold, int var_only);
- int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k);
- void bcf_p1_dump_afs(bcf_p1aux_t *ma);
- int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn);
- int bcf_p1_set_n1(bcf_p1aux_t *b, int n1);
- void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called
-
- int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/sam/bcftools/vcf.c b/sam/bcftools/vcf.c
deleted file mode 100644
index e8526a3..0000000
--- a/sam/bcftools/vcf.c
+++ /dev/null
@@ -1,249 +0,0 @@
-#include <zlib.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include "bcf.h"
-#include "kstring.h"
-#include "kseq.h"
-KSTREAM_INIT(gzFile, gzread, 4096)
-
-typedef struct {
- gzFile fp;
- FILE *fpout;
- kstream_t *ks;
- void *refhash;
- kstring_t line;
- int max_ref;
-} vcf_t;
-
-bcf_hdr_t *vcf_hdr_read(bcf_t *bp)
-{
- kstring_t meta, smpl;
- int dret;
- vcf_t *v;
- bcf_hdr_t *h;
- if (!bp->is_vcf) return bcf_hdr_read(bp);
- h = calloc(1, sizeof(bcf_hdr_t));
- v = (vcf_t*)bp->v;
- v->line.l = 0;
- memset(&meta, 0, sizeof(kstring_t));
- memset(&smpl, 0, sizeof(kstring_t));
- while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) {
- if (v->line.l < 2) continue;
- if (v->line.s[0] != '#') {
- free(meta.s);
- free(smpl.s);
- free(h);
- return 0; // no sample line
- }
- if (v->line.s[0] == '#' && v->line.s[1] == '#') {
- kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta);
- } else if (v->line.s[0] == '#') {
- int k;
- ks_tokaux_t aux;
- char *p;
- for (p = kstrtok(v->line.s, "\t\n", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
- if (k >= 9) {
- kputsn(p, aux.p - p, &smpl);
- kputc('\0', &smpl);
- }
- }
- break;
- }
- }
- kputc('\0', &meta);
- h->name = 0;
- h->sname = smpl.s; h->l_smpl = smpl.l;
- h->txt = meta.s; h->l_txt = meta.l;
- bcf_hdr_sync(h);
- return h;
-}
-
-bcf_t *vcf_open(const char *fn, const char *mode)
-{
- bcf_t *bp;
- vcf_t *v;
- if (strchr(mode, 'b')) return bcf_open(fn, mode);
- bp = calloc(1, sizeof(bcf_t));
- v = calloc(1, sizeof(vcf_t));
- bp->is_vcf = 1;
- bp->v = v;
- v->refhash = bcf_str2id_init();
- if (strchr(mode, 'r')) {
- v->fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
- v->ks = ks_init(v->fp);
- } else if (strchr(mode, 'w'))
- v->fpout = strcmp(fn, "-")? fopen(fn, "w") : stdout;
- return bp;
-}
-
-int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn)
-{
- vcf_t *v;
- gzFile fp;
- kstream_t *ks;
- kstring_t s, rn;
- int dret;
- if (bp == 0) return -1;
- if (!bp->is_vcf) return 0;
- s.l = s.m = 0; s.s = 0;
- rn.m = rn.l = h->l_nm; rn.s = h->name;
- v = (vcf_t*)bp->v;
- fp = gzopen(fn, "r");
- ks = ks_init(fp);
- while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
- bcf_str2id_add(v->refhash, strdup(s.s));
- kputs(s.s, &rn); kputc('\0', &rn);
- if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
- }
- ks_destroy(ks);
- gzclose(fp);
- h->l_nm = rn.l; h->name = rn.s;
- bcf_hdr_sync(h);
- free(s.s);
- return 0;
-}
-
-int vcf_close(bcf_t *bp)
-{
- vcf_t *v;
- if (bp == 0) return -1;
- if (!bp->is_vcf) return bcf_close(bp);
- v = (vcf_t*)bp->v;
- if (v->fp) {
- ks_destroy(v->ks);
- gzclose(v->fp);
- }
- if (v->fpout) fclose(v->fpout);
- free(v->line.s);
- bcf_str2id_thorough_destroy(v->refhash);
- free(v);
- free(bp);
- return 0;
-}
-
-int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h)
-{
- vcf_t *v = (vcf_t*)bp->v;
- int i, has_ver = 0;
- if (!bp->is_vcf) return bcf_hdr_write(bp, h);
- if (h->l_txt > 0) {
- if (strstr(h->txt, "##fileformat=")) has_ver = 1;
- if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n");
- fwrite(h->txt, 1, h->l_txt - 1, v->fpout);
- }
- if (h->l_txt == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n");
- fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
- for (i = 0; i < h->n_smpl; ++i)
- fprintf(v->fpout, "\t%s", h->sns[i]);
- fputc('\n', v->fpout);
- return 0;
-}
-
-int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
-{
- vcf_t *v = (vcf_t*)bp->v;
- extern void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s);
- if (!bp->is_vcf) return bcf_write(bp, h, b);
- bcf_fmt_core(h, b, &v->line);
- fwrite(v->line.s, 1, v->line.l, v->fpout);
- fputc('\n', v->fpout);
- return v->line.l + 1;
-}
-
-int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
-{
- int dret, k, i, sync = 0;
- vcf_t *v = (vcf_t*)bp->v;
- char *p, *q;
- kstring_t str, rn;
- ks_tokaux_t aux, a2;
- if (!bp->is_vcf) return bcf_read(bp, h, b);
- v->line.l = 0;
- str.l = 0; str.m = b->m_str; str.s = b->str;
- rn.l = rn.m = h->l_nm; rn.s = h->name;
- if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1;
- b->n_smpl = h->n_smpl;
- for (p = kstrtok(v->line.s, "\t", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
- *(char*)aux.p = 0;
- if (k == 0) { // ref
- int tid = bcf_str2id(v->refhash, p);
- if (tid < 0) {
- tid = bcf_str2id_add(v->refhash, strdup(p));
- kputs(p, &rn); kputc('\0', &rn);
- sync = 1;
- }
- b->tid = tid;
- } else if (k == 1) { // pos
- b->pos = atoi(p) - 1;
- } else if (k == 5) { // qual
- b->qual = (p[0] >= '0' && p[0] <= '9')? atof(p) : 0;
- } else if (k <= 8) { // variable length strings
- kputs(p, &str); kputc('\0', &str);
- b->l_str = str.l; b->m_str = str.m; b->str = str.s;
- if (k == 8) bcf_sync(b);
- } else { // k > 9
- if (strncmp(p, "./.", 3) == 0) {
- for (i = 0; i < b->n_gi; ++i) {
- if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
- ((uint8_t*)b->gi[i].data)[k-9] = 1<<7;
- } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
- ((uint8_t*)b->gi[i].data)[k-9] = 0;
- } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
- ((int32_t*)b->gi[i].data)[k-9] = 0;
- } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) {
- ((uint16_t*)b->gi[i].data)[k-9] = 0;
- } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
- int y = b->n_alleles * (b->n_alleles + 1) / 2;
- memset((uint8_t*)b->gi[i].data + (k - 9) * y, 0, y);
- } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
- int y = b->n_alleles * (b->n_alleles + 1) / 2;
- memset((float*)b->gi[i].data + (k - 9) * y, 0, y * 4);
- }
- }
- goto endblock;
- }
- for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) {
- if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
- ((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6;
- } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
- double _x = strtod(q, &q);
- int x = (int)(_x + .499);
- if (x > 255) x = 255;
- ((uint8_t*)b->gi[i].data)[k-9] = x;
- } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
- int x = strtol(q, &q, 10);
- if (x > 0xffff) x = 0xffff;
- ((uint32_t*)b->gi[i].data)[k-9] = x;
- } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) {
- int x = strtol(q, &q, 10);
- if (x > 0xffff) x = 0xffff;
- ((uint16_t*)b->gi[i].data)[k-9] = x;
- } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
- int x, y, j;
- uint8_t *data = (uint8_t*)b->gi[i].data;
- y = b->n_alleles * (b->n_alleles + 1) / 2;
- for (j = 0; j < y; ++j) {
- x = strtol(q, &q, 10);
- if (x > 255) x = 255;
- data[(k-9) * y + j] = x;
- ++q;
- }
- } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
- int j, y;
- float x, *data = (float*)b->gi[i].data;
- y = b->n_alleles * (b->n_alleles + 1) / 2;
- for (j = 0; j < y; ++j) {
- x = strtod(q, &q);
- data[(k-9) * y + j] = x > 0? -x/10. : x;
- ++q;
- }
- }
- }
- endblock: i = i;
- }
- }
- h->l_nm = rn.l; h->name = rn.s;
- if (sync) bcf_hdr_sync(h);
- return v->line.l + 1;
-}
diff --git a/sam/bcftools/vcfutils.pl b/sam/bcftools/vcfutils.pl
deleted file mode 100755
index 2b7ba0b..0000000
--- a/sam/bcftools/vcfutils.pl
+++ /dev/null
@@ -1,567 +0,0 @@
-#!/usr/bin/perl -w
-
-# Author: lh3
-
-use strict;
-use warnings;
-use Getopt::Std;
-
-&main;
-exit;
-
-sub main {
- &usage if (@ARGV < 1);
- my $command = shift(@ARGV);
- my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
- hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats,
- gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq);
- die("Unknown command \"$command\".\n") if (!defined($func{$command}));
- &{$func{$command}};
-}
-
-sub splitchr {
- my %opts = (l=>5000000);
- getopts('l:', \%opts);
- my $l = $opts{l};
- die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] <in.fa.fai>\n/) if (@ARGV == 0 && -t STDIN);
- while (<>) {
- my @t = split;
- my $last = 0;
- for (my $i = 0; $i < $t[1];) {
- my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l;
- print "$t[0]:".($i+1)."-$e\n";
- $i = $e;
- }
- }
-}
-
-sub subsam {
- die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
- my ($fh, %h);
- my $fn = shift(@ARGV);
- my @col;
- open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
- $h{$_} = 1 for (@ARGV);
- while (<$fh>) {
- if (/^##/) {
- print;
- } elsif (/^#/) {
- my @t = split;
- my @s = @t[0..8]; # all fixed fields + FORMAT
- for (9 .. $#t) {
- if ($h{$t[$_]}) {
- push(@s, $t[$_]);
- push(@col, $_);
- }
- }
- pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
- print join("\t", @s), "\n";
- } else {
- my @t = split;
- if (@col == 0) {
- print join("\t", @t[0..7]), "\n";
- } else {
- print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
- }
- }
- }
- close($fh);
-}
-
-sub listsam {
- die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
- while (<>) {
- if (/^#/ && !/^##/) {
- my @t = split;
- print join("\n", @t[9..$#t]), "\n";
- exit;
- }
- }
-}
-
-sub fillac {
- die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
- while (<>) {
- if (/^#/) {
- print;
- } else {
- my @t = split;
- my @c = (0, 0);
- my $n = 0;
- my $s = -1;
- @_ = split(":", $t[8]);
- for (0 .. $#_) {
- if ($_[$_] eq 'GT') { $s = $_; last; }
- }
- if ($s < 0) {
- print join("\t", @t), "\n";
- next;
- }
- for (9 .. $#t) {
- if ($t[$_] =~ /^0,0,0/) {
- } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
- ++$c[$2]; ++$c[$3];
- $n += 2;
- }
- }
- my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
- my $info = $t[7];
- $info =~ s/(;?)AC=(\d+)//;
- $info =~ s/(;?)AN=(\d+)//;
- if ($info eq '.') {
- $info = $AC;
- } else {
- $info .= ";$AC";
- }
- $t[7] = $info;
- print join("\t", @t), "\n";
- }
- }
-}
-
-sub ldstats {
- my %opts = (t=>0.9);
- getopts('t:', \%opts);
- die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
- my $cutoff = $opts{t};
- my ($last, $lastchr) = (0x7fffffff, '');
- my ($x, $y, $n) = (0, 0, 0);
- while (<>) {
- if (/^([^#\s]+)\s(\d+)/) {
- my ($chr, $pos) = ($1, $2);
- if (/NEIR=([\d\.]+)/) {
- ++$n;
- ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
- }
- $last = $pos; $lastchr = $chr;
- }
- }
- print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
- print "Fraction: ", $y/$n, "\n";
- print "Length: $x\n";
-}
-
-sub qstats {
- my %opts = (r=>'', s=>0.02, v=>undef);
- getopts('r:s:v', \%opts);
- die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
-Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
- my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
- my %h = ();
- my $is_vcf = defined($opts{v})? 1 : 0;
- if ($opts{r}) { # read the reference positions
- my $fh;
- open($fh, $opts{r}) || die;
- while (<$fh>) {
- next if (/^#/);
- if ($is_vcf) {
- my @t = split;
- $h{$t[0],$t[1]} = $t[4];
- } else {
- $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
- }
- }
- close($fh);
- }
- my $hsize = scalar(keys %h);
- my @a;
- while (<>) {
- next if (/^#/);
- my @t = split;
- next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
- $t[3] = uc($t[3]); $t[4] = uc($t[4]);
- my @s = split(',', $t[4]);
- $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
- next if (length($s[0]) != 1);
- my $hit;
- if ($is_vcf) {
- $hit = 0;
- my $aa = $h{$t[0],$t[1]};
- if (defined($aa)) {
- my @aaa = split(",", $aa);
- for (@aaa) {
- $hit = 1 if ($_ eq $s[0]);
- }
- }
- } else {
- $hit = defined($h{$t[0],$t[1]})? 1 : 0;
- }
- push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
- }
- push(@a, [-1, 0, 0, 0]); # end marker
- die("[qstats] No SNP data!\n") if (@a == 0);
- @a = sort {$b->[0]<=>$a->[0]} @a;
- my $next = $opts{s};
- my $last = $a[0];
- my @c = (0, 0, 0, 0);
- my @lc;
- $lc[1] = $lc[2] = 0;
- for my $p (@a) {
- if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
- my @x;
- $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
- $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
- $x[2] = sprintf("%.4f", $c[3] / $c[1]);
- my $a = $c[1] - $lc[1];
- my $b = $c[2] - $lc[2];
- $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
- print join("\t", $last, @c, @x), "\n";
- $next = $c[0]/@a + $opts{s};
- $lc[1] = $c[1]; $lc[2] = $c[2];
- }
- ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
- $last = $p->[0];
- }
-}
-
-sub varFilter { # filter the variants of a VCF stream; records are released through varFilter_aux()
- my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>3, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000, e=>1e-4);
- getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:e:', \%opts); # -G and -S are parsed here but not listed in the usage text
- die(qq/
-Usage: vcfutils.pl varFilter [options] <in.vcf>
-
-Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
- -d INT minimum read depth [$opts{d}]
- -D INT maximum read depth [$opts{D}]
- -a INT minimum number of alternate bases [$opts{a}]
- -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
- -W INT window size for filtering adjacent gaps [$opts{W}]
- -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
- -2 FLOAT min P-value for baseQ bias [$opts{2}]
- -3 FLOAT min P-value for mapQ bias [$opts{3}]
- -4 FLOAT min P-value for end distance bias [$opts{4}]
- -e FLOAT min P-value for HWE (plus F<0) [$opts{e}]
- -p print filtered variants
-
-Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
-\n/) if (@ARGV == 0 && -t STDIN);
-
- # calculate the window size
- my ($ol, $ow) = ($opts{W}, $opts{w});
- my $max_dist = $ol > $ow? $ol : $ow; # records stay staged while they are within the larger of -W and -w
- # the core loop
- my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
- while (<>) {
- my @t = split;
- if (/^#/) {
- print; next;
- }
- next if ($t[4] eq '.'); # skip non-var sites
- next if ($t[3] eq 'N'); # skip sites with unknown ref ('N')
- # check if the site is a SNP
- my $type = 1; # SNP
- if (length($t[3]) > 1) {
- $type = 2; # MNP
- my @s = split(',', $t[4]);
- for (@s) {
- $type = 3 if (length != length($t[3])); # any ALT of a different length than REF makes it an indel
- }
- } else {
- my @s = split(',', $t[4]);
- for (@s) {
- $type = 3 if (length > 1);
- }
- }
- # clear the out-of-range elements
- while (@staging) {
- # Still on the same chromosome and the first element's window still affects this position?
- last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
- varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
- }
- my $flt = 0; # 0 means "keep"; other values index the tag string in varFilter_aux()
- # parse annotations
- my ($dp, $mq, $dp_alt) = (-1, -1, -1); # -1 marks an annotation that is absent
- if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
- $dp = $1 + $2 + $3 + $4;
- $dp_alt = $3 + $4;
- }
- if ($t[7] =~ /DP=(\d+)/i) {
- $dp = $1; # when present, DP overrides the DP4 sum
- }
- $mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
- # the depth and mapQ filter
- if ($dp >= 0) {
- if ($dp < $opts{d}) {
- $flt = 2;
- } elsif ($dp > $opts{D}) {
- $flt = 3;
- }
- }
- $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a}); # too few alternate bases; overrides a depth code
- $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q}); # low RMS mapping quality
- $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
- && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4})); # any of the four PV4 P-values below its threshold
- $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S})));
- # HWE filter
- if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/ && $4 < $opts{e}) {
- my $p = 2*$1 + $2;
- my $f = ($p > 0 && $p < 1)? 1 - $2 / ($p * (1-$p)) : 0;
- $flt = 9 if ($f < 0); # applied even if an earlier filter already fired
- }
-
- my $score = $t[5] * 100 + $dp_alt; # QUAL dominates; the alternate-base count breaks ties
- my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs
- if ($flt == 0) {
- if ($type == 3) { # an indel
- # filtering SNPs and MNPs
- for my $x (@staging) {
- next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
- $x->[1] = 5; # staged SNP/MNP lies within -w of this gap
- }
- # check the staging list for indel filtering
- for my $x (@staging) {
- next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
- if ($x->[0]>>2 < $score) {
- $x->[1] = 6; # the lower-scoring of two nearby gaps is dropped
- } else {
- $flt = 6; last;
- }
- }
- } else { # SNP or MNP
- for my $x (@staging) {
- next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]);
- if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1)
- && length($x->[7]) - length($x->[6]) == 1) {
- $x->[1] = 5;
- } else { $flt = 5; }
- last; # only the first staged indel in range is examined
- }
- # check MNP
- for my $x (@staging) {
- next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]);
- if ($x->[0]>>2 < $score) {
- $x->[1] = 8;
- } else {
- $flt = 8; last;
- }
- }
- }
- }
- push(@staging, [$score<<2|$type, $flt, $rlen, @t]); # low two bits of element 0 hold the type, the rest the score
- }
- # output the last few elements in the staging list
- while (@staging) {
- varFilter_aux(shift @staging, $opts{p});
- }
-}
-
-sub varFilter_aux { # emit one staged record: kept records go to STDOUT, filtered ones to STDERR when $is_print is true
- my ($first, $is_print) = @_;
- if ($first->[1] == 0) { # element 1 is the filter code; 0 means the record passed
- print join("\t", @$first[3 .. @$first-1]), "\n"; # elements 0-2 are staging metadata, the VCF fields start at index 3
- } elsif ($is_print) {
- print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; # the filter code selects a one-letter tag
- }
-}
-
-sub gapstats {
-  # Print a histogram of indel lengths found in a VCF stream ("C" lines),
-  # followed by one summary line ("3") with the total count, the count whose
-  # length is a multiple of three, and the ratio of the two.
-  # Bins are offset by 5000 so that deletions (negative lengths) index the array.
-  my (@c0, @c1);
-  $c0[$_] = $c1[$_] = 0 for (0 .. 10000);
-  while (<>) {
-    next if (/^#/);
-    my @t = split;
-    next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel
-    my @s = split(',', $t[4]);
-    for my $x (@s) {
-      my $l = length($x) - length($t[3]) + 5000;
-      if ($x =~ /^-/) { # "-SEQ" style deletion
-        $l = -(length($x) - 1) + 5000;
-      } elsif ($x =~ /^\+/) { # "+SEQ" style insertion
-        $l = length($x) - 1 + 5000;
-      }
-      $c0[$l] += 1 / @s; # a multi-allelic site spreads one count over its alleles
-    }
-  }
-  for (my $i = 0; $i <= 10000; ++$i) { # inclusive: bin 10000 is initialised above and must be reported too
-    next if ($c0[$i] == 0);
-    $c1[0] += $c0[$i];
-    $c1[1] += $c0[$i] if (($i-5000)%3 == 0);
-    printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]);
-  }
-  # guard the ratio: input without any indel used to die with "Illegal division by zero"
-  printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[0] > 0? $c1[1]/$c1[0] : 0);
-}
-
-sub ucscsnp2vcf {
-  # Convert a tab-separated UCSC SNP dump (file argument or STDIN) into a
-  # sites-only VCFv4.0 stream on STDOUT.
-  # The usage line now names the sub-command, matching the other commands.
-  die("Usage: vcfutils.pl ucscsnp2vcf <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
-  print "##fileformat=VCFv4.0\n";
-  print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
-  while (<>) {
-    my @t = split("\t");
-    # anything other than single bases separated by '/' is treated as an indel
-    my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
-    my $pos = $t[2] + 1;
-    my @alt;
-    push(@alt, $t[7]); # $alt[0] temporarily holds the reference allele
-    if ($t[6] eq '-') { # alleles reported on the reverse strand: reverse-complement them
-      $t[9] = reverse($t[9]);
-      $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
-    }
-    my @a = split("/", $t[9]);
-    for (@a) {
-      push(@alt, $_) if ($_ ne $alt[0]);
-    }
-    if ($indel) {
-      --$pos;
-      for (0 .. $#alt) {
-        $alt[$_] =~ tr/-//d;
-        $alt[$_] = "N$alt[$_]"; # prefix every allele with a placeholder base
-      }
-    }
-    my $ref = shift(@alt);
-    my $af = $t[13] > 0? ";AF=$t[13]" : '';
-    my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
-    my $info = "molType=$t[10];class=$t[11]$valid$af";
-    print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
-  }
-}
-
-sub hapmap2vcf {
-  # Convert HapMap genotypes (remaining arguments or STDIN) to VCF. Chromosome,
-  # position and reference allele of each rs ID come from a UCSC SNP dump given
-  # as the first argument; a name ending in ".gz" is decompressed with gzip.
-  die("Usage: vcfutils.pl hapmap2vcf <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
-  my $fn = shift(@ARGV);
-  # parse UCSC SNP
-  warn("Parsing UCSC SNPs...\n");
-  my ($fh, %map);
-  # list-form pipe and 3-argument open: the file name is never seen by a shell
-  if ($fn =~ /\.gz$/) {
-    open($fh, '-|', 'gzip', '-dc', $fn) || die("[hapmap2vcf] fail to run gzip on '$fn': $!\n");
-  } else {
-    open($fh, '<', $fn) || die("[hapmap2vcf] fail to open '$fn': $!\n");
-  }
-  while (<$fh>) {
-    my @t = split;
-    next if ($t[3] - $t[2] != 1); # not SNP
-    @{$map{$t[4]}} = @t[1,3,7]; # ID -> (chromosome, end position, reference allele)
-  }
-  close($fh);
-  # write VCF
-  warn("Writing VCF...\n");
-  print "##fileformat=VCFv4.0\n";
-  while (<>) {
-    my @t = split;
-    if ($t[0] eq 'rs#') { # the first line
-      print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
-    } else {
-      next unless ($map{$t[0]});
-      next if (length($t[1]) != 3); # skip non-SNPs
-      my $a = \@{$map{$t[0]}};
-      my $ref = $a->[2];
-      my @u = split('/', $t[1]);
-      if ($u[1] eq $ref) { # make $u[0] the reference allele
-        $u[1] = $u[0]; $u[0] = $ref;
-      } elsif ($u[0] ne $ref) { next; } # neither allele matches the reference
-      my $alt = $u[1];
-      my %w;
-      $w{$u[0]} = 0; $w{$u[1]} = 1; # allele -> GT index
-      my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
-      my $is_tri = 0;
-      for (@t[11..$#t]) {
-        if ($_ eq 'NN') {
-          push(@s, './.');
-        } else {
-          my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
-          if (!defined($a[0]) || !defined($a[1])) { # a third allele: drop the whole site
-            $is_tri = 1;
-            last;
-          }
-          push(@s, "$a[0]/$a[1]");
-        }
-      }
-      next if ($is_tri);
-      print join("\t", @s), "\n";
-    }
-  }
-}
-
-sub vcf2fq { # turn an all-site VCF into one FASTQ record per chromosome, printed by v2q_post_process()
- my %opts = (d=>3, D=>100000, Q=>10, l=>5);
- getopts('d:D:Q:l:', \%opts);
- die(qq/
-Usage: vcfutils.pl vcf2fq [options] <all-site.vcf>
-
-Options: -d INT minimum depth [$opts{d}]
- -D INT maximum depth [$opts{D}]
- -Q INT min RMS mapQ [$opts{Q}]
- -l INT INDEL filtering window [$opts{l}]
-\n/) if (@ARGV == 0 && -t STDIN);
-
- my ($last_chr, $seq, $qual, $last_pos, @gaps);
- my $_Q = $opts{Q};
- my $_d = $opts{d};
- my $_D = $opts{D};
-
- my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y',
- GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K'); # ambiguity code for each pair of distinct bases
-
- $last_chr = '';
- while (<>) {
- next if (/^#/);
- my @t = split;
- if ($last_chr ne $t[0]) { # chromosome changed: flush the previous one and start over
- &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr);
- ($last_chr, $last_pos) = ($t[0], 0);
- $seq = $qual = '';
- @gaps = ();
- }
- die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0);
- if ($t[1] - $last_pos > 1) { # positions missing from the input become 'n' with quality '!'
- $seq .= 'n' x ($t[1] - $last_pos - 1);
- $qual .= '!' x ($t[1] - $last_pos - 1);
- }
- if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference
- my ($ref, $alt) = ($t[3], $1);
- my ($b, $q);
- $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/); # NOTE(review): $q stays undefined when INFO carries no FQ tag
- if ($q < 0) { # negative FQ: emit a single base, chosen by AF1
- $_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0;
- $b = ($_ < .5 || $alt eq '.')? $ref : $alt;
- $q = -$q;
- } else {
- $b = $het{"$ref$alt"};
- $b ||= 'N';
- }
- $b = lc($b);
- $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D)); # upper case only when MQ and DP pass
- $q = int($q + 33 + .499);
- $q = chr($q <= 126? $q : 126); # quality character, capped at code 126
- $seq .= $b;
- $qual .= $q;
- } elsif ($t[4] ne '.') { # an INDEL
- push(@gaps, [$t[1], length($t[3])]); # (position, REF length), used later for masking
- }
- $last_pos = $t[1];
- }
- &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l});
-}
-
-sub v2q_post_process { # lower-case the sequence around every recorded gap, then print the FASTQ record for $chr
- my ($chr, $seq, $qual, $gaps, $l) = @_; # $seq and $qual are string references, $gaps an array of [position, length]
- for my $g (@$gaps) {
- my $beg = $g->[0] > $l? $g->[0] - $l : 0;
- my $end = $g->[0] + $g->[1] + $l;
- $end = length($$seq) if ($end > length($$seq)); # clamp the window to the sequence
- substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg));
- }
- print "\@$chr\n"; &v2q_print_str($seq);
- print "+\n"; &v2q_print_str($qual);
-}
-
-sub v2q_print_str { # print the referenced string to STDOUT in lines of at most 60 characters
- my ($s) = @_; # $s is a reference to the string
- my $l = length($$s);
- for (my $i = 0; $i < $l; $i += 60) {
- print substr($$s, $i, 60), "\n";
- }
-}
-
-sub usage { # print the top-level command summary and terminate through die()
- die(qq/
-Usage: vcfutils.pl <command> [<arguments>]\n
-Command: subsam get a subset of samples
- listsam list the samples
- fillac fill the allele count field
- qstats SNP stats stratified by QUAL
-
- hapmap2vcf convert the hapmap format to VCF
- ucscsnp2vcf convert UCSC SNP SQL dump to VCF
-
- varFilter filtering short variants (*)
- vcf2fq VCF->fastq (**)
-
-Notes: Commands with description ending with (*) may need bcftools
- specific annotations.
-\n/);
-}
diff --git a/sam/bedcov.c b/sam/bedcov.c
deleted file mode 100644
index 3e4b952..0000000
--- a/sam/bedcov.c
+++ /dev/null
@@ -1,127 +0,0 @@
-#include <zlib.h>
-#include <stdio.h>
-#include <ctype.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include "kstring.h"
-#include "bgzf.h"
-#include "bam.h"
-
-#include "kseq.h"
-KSTREAM_INIT(gzFile, gzread, 16384)
-
-typedef struct { // per-BAM state handed to read_bam() through its void* argument
- bamFile fp; // open BAM handle
- bam_iter_t iter; // iterator for the BED interval currently being processed
- int min_mapQ; // reads with a lower mapping quality are flagged BAM_FUNMAP
-} aux_t;
-
-/* Pileup read callback: fetch the next record from the iterator stored in
- * `data` (an aux_t) and flag it as unmapped when its mapping quality is below
- * the configured minimum. Returns whatever bam_iter_read() returned. */
-static int read_bam(void *data, bam1_t *b)
-{
-    aux_t *ctx = (aux_t*)data;
-    const int status = bam_iter_read(ctx->fp, ctx->iter, b);
-    const int mapq = (int)b->core.qual;
-    if (mapq < ctx->min_mapQ) {
-        b->core.flag |= BAM_FUNMAP;
-    }
-    return status;
-}
-
-/* Entry point of "samtools bedcov": for every line of the BED file print the
- * line followed by one tab-separated column per BAM file, holding the sum of
- * the per-position pileup depths inside [beg, end).
- * Returns 0 on success, 1 on a usage error, 2 when an input cannot be opened. */
-int main_bedcov(int argc, char *argv[])
-{
-    extern void bam_init_header_hash(bam_header_t*);
-    gzFile fp;
-    kstring_t str;
-    kstream_t *ks;
-    bam_index_t **idx;
-    bam_header_t *h = 0;
-    aux_t **aux;
-    int *n_plp, dret, i, n, c, min_mapQ = 0;
-    int64_t *cnt;
-    const bam_pileup1_t **plp;
-
-    while ((c = getopt(argc, argv, "Q:")) >= 0) {
-        switch (c) {
-        case 'Q': min_mapQ = atoi(optarg); break;
-        }
-    }
-    if (optind + 2 > argc) {
-        fprintf(stderr, "Usage: samtools bedcov <in.bed> <in1.bam> [...]\n");
-        return 1;
-    }
-    memset(&str, 0, sizeof(kstring_t));
-    n = argc - optind - 1; // number of BAM files
-    aux = calloc(n, sizeof(void*));
-    idx = calloc(n, sizeof(void*));
-    for (i = 0; i < n; ++i) {
-        aux[i] = calloc(1, sizeof(aux_t));
-        aux[i]->min_mapQ = min_mapQ;
-        aux[i]->fp = bam_open(argv[i+optind+1], "r");
-        idx[i] = bam_index_load(argv[i+optind+1]);
-        if (aux[i]->fp == 0 || idx[i] == 0) {
-            fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
-            return 2;
-        }
-        bgzf_set_cache_size(aux[i]->fp, 20);
-        if (i == 0) h = bam_header_read(aux[0]->fp); // target names are taken from the first BAM only
-    }
-    bam_init_header_hash(h);
-    cnt = calloc(n, 8);
-
-    fp = gzopen(argv[optind], "rb");
-    if (fp == 0) { // an unreadable BED file used to be passed on as a NULL gzFile
-        fprintf(stderr, "ERROR: fail to open BED file '%s'\n", argv[optind]);
-        return 2;
-    }
-    ks = ks_init(fp);
-    n_plp = calloc(n, sizeof(int));
-    plp = calloc(n, sizeof(void*));
-    while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
-        char *p, *q;
-        int tid, beg, end, pos;
-        bam_mplp_t mplp;
-
-        // column 1: target name
-        for (p = q = str.s; *p && *p != '\t'; ++p);
-        if (*p != '\t') goto bed_error;
-        *p = 0; tid = bam_get_tid(h, q); *p = '\t';
-        if (tid < 0) goto bed_error;
-        // column 2: start; the cast keeps isdigit() defined for bytes >= 0x80
-        for (q = p = p + 1; isdigit((unsigned char)*p); ++p);
-        if (*p != '\t') goto bed_error;
-        *p = 0; beg = atoi(q); *p = '\t';
-        // column 3: end, terminated by a tab or the end of the line
-        for (q = p = p + 1; isdigit((unsigned char)*p); ++p);
-        if (*p == '\t' || *p == 0) {
-            int sep = *p;
-            *p = 0; end = atoi(q); *p = sep;
-        } else goto bed_error;
-
-        for (i = 0; i < n; ++i) {
-            if (aux[i]->iter) bam_iter_destroy(aux[i]->iter);
-            aux[i]->iter = bam_iter_query(idx[i], tid, beg, end);
-        }
-        mplp = bam_mplp_init(n, read_bam, (void**)aux);
-        bam_mplp_set_maxcnt(mplp, 64000);
-        memset(cnt, 0, 8 * n);
-        while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0)
-            if (pos >= beg && pos < end)
-                for (i = 0; i < n; ++i) cnt[i] += n_plp[i];
-        for (i = 0; i < n; ++i) {
-            kputc('\t', &str);
-            kputl(cnt[i], &str);
-        }
-        puts(str.s);
-        bam_mplp_destroy(mplp);
-        continue;
-
-bed_error:
-        fprintf(stderr, "Errors in BED line '%s'\n", str.s);
-    }
-    free(n_plp); free(plp);
-    ks_destroy(ks);
-    gzclose(fp);
-
-    free(cnt);
-    for (i = 0; i < n; ++i) {
-        if (aux[i]->iter) bam_iter_destroy(aux[i]->iter);
-        bam_index_destroy(idx[i]);
-        bam_close(aux[i]->fp);
-        free(aux[i]);
-    }
-    bam_header_destroy(h);
-    free(aux); free(idx);
-    free(str.s);
-    return 0;
-}
diff --git a/sam/bedidx.c b/sam/bedidx.c
deleted file mode 100644
index ec75a10..0000000
--- a/sam/bedidx.c
+++ /dev/null
@@ -1,162 +0,0 @@
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <stdio.h>
-#include <zlib.h>
-
-#ifdef _WIN32
-#define drand48() ((double)rand() / RAND_MAX)
-#endif
-
-#include "ksort.h"
-KSORT_INIT_GENERIC(uint64_t)
-
-#include "kseq.h"
-KSTREAM_INIT(gzFile, gzread, 8192)
-
-typedef struct { // the regions stored for one chromosome name
- int n, m; // n: regions in a[]; m: capacity of a[] while reading, later overwritten by bed_index() with the idx[] length
- uint64_t *a; // each entry packs beg in the high 32 bits and end in the low 32 bits
- int *idx; // per-bin offset into a[] of the first region touching the bin, or -1
-} bed_reglist_t;
-
-#include "khash.h"
-KHASH_MAP_INIT_STR(reg, bed_reglist_t)
-
-#define LIDX_SHIFT 13
-
-typedef kh_reg_t reghash_t;
-
-int *bed_index_core(int n, uint64_t *a, int *n_idx) // build the bin index over the n packed regions in a[]; returns a heap array, *n_idx receives its logical length
-{
- int i, j, m, *idx;
- m = *n_idx = 0; idx = 0;
- for (i = 0; i < n; ++i) {
- int beg, end;
- beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; // first and last bin touched by region i
- if (m < end + 1) { // grow idx[] and mark the new bins as empty (-1)
- int oldm = m;
- m = end + 1;
- kroundup32(m);
- idx = realloc(idx, m * sizeof(int)); // NOTE(review): result is not checked
- for (j = oldm; j < m; ++j) idx[j] = -1;
- }
- if (beg == end) {
- if (idx[beg] < 0) idx[beg] = i; // keep the first region seen for the bin
- } else {
- for (j = beg; j <= end; ++j)
- if (idx[j] < 0) idx[j] = i;
- }
- *n_idx = end + 1; // NOTE(review): follows the region processed last, not the largest end bin seen
- }
- return idx;
-}
-
-/* Sort the regions of every chromosome in the hash and rebuild its bin index,
- * discarding any index built earlier. */
-void bed_index(void *_h)
-{
-    reghash_t *hash = (reghash_t*)_h;
-    khint_t slot;
-    for (slot = 0; slot < kh_end(hash); ++slot) {
-        bed_reglist_t *regs;
-        if (!kh_exist(hash, slot)) continue;
-        regs = &kh_val(hash, slot);
-        free(regs->idx); /* free(NULL) is a no-op, so no guard is needed */
-        ks_introsort(uint64_t, regs->n, regs->a);
-        regs->idx = bed_index_core(regs->n, regs->a, &regs->m);
-    }
-}
-
-/* Return 1 when [beg, end) overlaps any region of p, 0 otherwise.
- *
- * idx[] is used only to find a safe starting offset for the linear scan.
- * Its valid length is p->m (stored by bed_index()), not the region count
- * p->n; bounding the lookup by p->n could read past the end of idx[] when a
- * chromosome has more regions than bins. */
-int bed_overlap_core(const bed_reglist_t *p, int beg, int end)
-{
-    int i, min_off = 0;
-    if (p->n == 0) return 0;
-    if (p->idx != 0 && p->m > 0) {
-        int bin = beg >> LIDX_SHIFT;
-        if (bin < 0) bin = 0;
-        if (bin >= p->m) bin = p->m - 1; // clamp to the last indexed bin
-        min_off = p->idx[bin];
-        if (min_off < 0) { // empty bin: fall back to the nearest earlier non-empty one
-            for (i = bin - 1; i >= 0; --i)
-                if (p->idx[i] >= 0) break;
-            min_off = i >= 0? p->idx[i] : 0;
-        }
-    }
-    for (i = min_off; i < p->n; ++i) {
-        if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed
-        if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end)
-            return 1; // find the overlap; return
-    }
-    return 0;
-}
-
-/* Return 1 when [beg, end) on chromosome `chr` overlaps a region stored in the
- * hash built by bed_read(); 0 when it does not, when the chromosome is
- * unknown, or when no hash was supplied. */
-int bed_overlap(const void *_h, const char *chr, int beg, int end)
-{
-    const reghash_t *hash = (const reghash_t*)_h;
-    khint_t slot;
-    if (hash == 0) return 0;
-    slot = kh_get(reg, hash, chr);
-    return slot == kh_end(hash)? 0 : bed_overlap_core(&kh_val(hash, slot), beg, end);
-}
-
-/* Load a BED-like file ("-" means stdin) into a hash keyed by chromosome name
- * and index it with bed_index(). A line with only a position column is taken
- * as the single base ending at that position. Returns the hash as an opaque
- * pointer (release it with bed_destroy()), or 0 when the file cannot be opened. */
-void *bed_read(const char *fn)
-{
-    reghash_t *h = kh_init(reg);
-    gzFile fp;
-    kstream_t *ks;
-    int dret;
-    kstring_t *str;
-    // read the list
-    fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
-    if (fp == 0) {
-        kh_destroy(reg, h); // the empty hash used to leak on this path
-        return 0;
-    }
-    str = calloc(1, sizeof(kstring_t));
-    ks = ks_init(fp);
-    while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name
-        int beg = -1, end = -1;
-        bed_reglist_t *p;
-        khint_t k = kh_get(reg, h, str->s);
-        if (k == kh_end(h)) { // absent from the hash table
-            int ret;
-            char *s = strdup(str->s);
-            k = kh_put(reg, h, s, &ret);
-            memset(&kh_val(h, k), 0, sizeof(bed_reglist_t));
-        }
-        p = &kh_val(h, k);
-        if (dret != '\n') { // if the lines has other characters
-            // the casts keep isdigit() defined for bytes >= 0x80
-            if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit((unsigned char)str->s[0])) {
-                beg = atoi(str->s); // begin
-                if (dret != '\n') {
-                    if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit((unsigned char)str->s[0])) {
-                        end = atoi(str->s); // end
-                        if (end < beg) end = -1;
-                    }
-                }
-            }
-        }
-        if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
-        if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
-        if (beg >= 0 && end > beg) {
-            if (p->n == p->m) { // double the capacity, starting at 4
-                p->m = p->m? p->m<<1 : 4;
-                p->a = realloc(p->a, p->m * 8);
-            }
-            p->a[p->n++] = (uint64_t)beg<<32 | end;
-        }
-    }
-    ks_destroy(ks);
-    gzclose(fp);
-    free(str->s); free(str);
-    bed_index(h);
-    return h;
-}
-
-/* Release a hash returned by bed_read(): the region array, the bin index and
- * the duplicated key of every entry, then the table itself. */
-void bed_destroy(void *_h)
-{
-    reghash_t *hash = (reghash_t*)_h;
-    khint_t slot;
-    for (slot = 0; slot < kh_end(hash); ++slot) {
-        if (!kh_exist(hash, slot)) continue;
-        free(kh_val(hash, slot).a);
-        free(kh_val(hash, slot).idx);
-        free((char*)kh_key(hash, slot));
-    }
-    kh_destroy(reg, hash);
-}
diff --git a/sam/bgzf.c b/sam/bgzf.c
deleted file mode 100644
index 880d5af..0000000
--- a/sam/bgzf.c
+++ /dev/null
@@ -1,694 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
- 2011 Attractive Chaos <***@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <assert.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include "bgzf.h"
-
-#ifdef _USE_KNETFILE
-#include "knetfile.h"
-typedef knetFile *_bgzf_file_t;
-#define _bgzf_open(fn, mode) knet_open(fn, mode)
-#define _bgzf_dopen(fp, mode) knet_dopen(fp, mode)
-#define _bgzf_close(fp) knet_close(fp)
-#define _bgzf_fileno(fp) ((fp)->fd)
-#define _bgzf_tell(fp) knet_tell(fp)
-#define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence)
-#define _bgzf_read(fp, buf, len) knet_read(fp, buf, len)
-#define _bgzf_write(fp, buf, len) knet_write(fp, buf, len)
-#else // ~defined(_USE_KNETFILE)
-#if defined(_WIN32) || defined(_MSC_VER)
-#define ftello(fp) ftell(fp)
-#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
-#else // ~defined(_WIN32)
-extern off_t ftello(FILE *stream);
-extern int fseeko(FILE *stream, off_t offset, int whence);
-#endif // ~defined(_WIN32)
-typedef FILE *_bgzf_file_t;
-#define _bgzf_open(fn, mode) fopen(fn, mode)
-#define _bgzf_dopen(fp, mode) fdopen(fp, mode)
-#define _bgzf_close(fp) fclose(fp)
-#define _bgzf_fileno(fp) fileno(fp)
-#define _bgzf_tell(fp) ftello(fp)
-#define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence)
-#define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp)
-#define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp)
-#endif // ~define(_USE_KNETFILE)
-
-#define BLOCK_HEADER_LENGTH 18
-#define BLOCK_FOOTER_LENGTH 8
-
-
-/* BGZF/GZIP header (specialized from RFC 1952; little endian):
- +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
- | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN|
- +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
-*/
-static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0"; // header template copied by bgzf_compress(); its last two bytes are overwritten with the block length
-
-#ifdef BGZF_CACHE
-typedef struct { // one cached, already inflated block
- int size; // number of uncompressed bytes in the block
- uint8_t *block; // heap copy of the uncompressed buffer
- int64_t end_offset; // file offset just past the compressed block
-} cache_t;
-#include "khash.h"
-KHASH_MAP_INIT_INT64(cache, cache_t)
-#endif
-
-static inline void packInt16(uint8_t *buffer, uint16_t value) // store value into buffer[0..1], low byte first
-{
- buffer[0] = value;
- buffer[1] = value >> 8;
-}
-
-static inline int unpackInt16(const uint8_t *buffer) // read two bytes, low byte first, as a non-negative int
-{
- return buffer[0] | buffer[1] << 8;
-}
-
-static inline void packInt32(uint8_t *buffer, uint32_t value) // store value into buffer[0..3], low byte first
-{
- buffer[0] = value;
- buffer[1] = value >> 8;
- buffer[2] = value >> 16;
- buffer[3] = value >> 24;
-}
-
-static BGZF *bgzf_read_init() // allocate a zeroed reader handle with both block buffers; NOTE(review): allocation results are not checked
-{
- BGZF *fp;
- fp = calloc(1, sizeof(BGZF));
- fp->is_write = 0;
- fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
- fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
-#ifdef BGZF_CACHE
- fp->cache = kh_init(cache); // the block cache exists for readers only
-#endif
- return fp;
-}
-
-static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level
-{
- BGZF *fp;
- fp = calloc(1, sizeof(BGZF)); // NOTE(review): allocation results are not checked
- fp->is_write = 1;
- fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
- fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
- fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
- if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; // levels above 9 fall back to the default
- return fp;
-}
-// get the compress level from the mode string: the first digit found, 0 when 'u' is present, otherwise -1
-static int mode2level(const char *__restrict mode)
-{
- int i, compress_level = -1;
- for (i = 0; mode[i]; ++i)
- if (mode[i] >= '0' && mode[i] <= '9') break;
- if (mode[i]) compress_level = (int)mode[i] - '0';
- if (strchr(mode, 'u')) compress_level = 0; // 'u' wins over any digit
- return compress_level;
-}
-
-BGZF *bgzf_open(const char *path, const char *mode) // open path for reading ('r'/'R' in mode) or writing ('w'/'W'); returns 0 when the file cannot be opened or mode has neither
-{
- BGZF *fp = 0;
- assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
- if (strchr(mode, 'r') || strchr(mode, 'R')) { // the read test comes first, so a mode with both letters opens for reading
- _bgzf_file_t fpr;
- if ((fpr = _bgzf_open(path, "r")) == 0) return 0;
- fp = bgzf_read_init();
- fp->fp = fpr;
- } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
- FILE *fpw; // writing always uses stdio, whether or not _USE_KNETFILE is defined
- if ((fpw = fopen(path, "w")) == 0) return 0;
- fp = bgzf_write_init(mode2level(mode));
- fp->fp = fpw;
- }
- return fp;
-}
-
-BGZF *bgzf_dopen(int fd, const char *mode) // same as bgzf_open() but wraps an already open file descriptor
-{
- BGZF *fp = 0;
- assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
- if (strchr(mode, 'r') || strchr(mode, 'R')) {
- _bgzf_file_t fpr;
- if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0;
- fp = bgzf_read_init();
- fp->fp = fpr;
- } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
- FILE *fpw;
- if ((fpw = fdopen(fd, "w")) == 0) return 0;
- fp = bgzf_write_init(mode2level(mode)); // the compression level is parsed from the mode string
- fp->fp = fpw;
- }
- return fp;
-}
-
-/* Compress slen bytes from src into one complete BGZF block at _dst.
- * On entry *dlen is the capacity of _dst; on success it becomes the total
- * block size (header + deflated data + footer). Returns 0 on success and -1
- * on any zlib failure. */
-static int bgzf_compress(void *_dst, int *dlen, void *src, int slen, int level)
-{
-    uint32_t crc;
-    z_stream zs;
-    uint8_t *dst = (uint8_t*)_dst;
-
-    // compress the body
-    zs.zalloc = NULL; zs.zfree = NULL;
-    zs.opaque = NULL; // zlib requires zalloc, zfree and opaque to be set before deflateInit2()
-    zs.next_in = src;
-    zs.avail_in = slen;
-    zs.next_out = dst + BLOCK_HEADER_LENGTH;
-    zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
-    if (deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; // -15 to disable zlib header/footer
-    if (deflate(&zs, Z_FINISH) != Z_STREAM_END) {
-        deflateEnd(&zs); // release the stream state, which used to leak on this path
-        return -1;
-    }
-    if (deflateEnd(&zs) != Z_OK) return -1;
-    *dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
-    // write the header
-    memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
-    packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes
-    // write the footer
-    crc = crc32(crc32(0L, NULL, 0L), src, slen);
-    packInt32((uint8_t*)&dst[*dlen - 8], crc);
-    packInt32((uint8_t*)&dst[*dlen - 4], slen);
-    return 0;
-}
-
-// Compress the first block_length bytes of fp->uncompressed_block into
-// fp->compressed_block as one BGZF block. Returns the size of the compressed
-// block and resets fp->block_offset, or returns -1 with BGZF_ERR_ZLIB recorded.
-static int deflate_block(BGZF *fp, int block_length)
-{
-    int out_len = BGZF_MAX_BLOCK_SIZE;
-    const int rc = bgzf_compress(fp->compressed_block, &out_len, fp->uncompressed_block, block_length, fp->compress_level);
-    if (rc == 0) {
-        fp->block_offset = 0;
-        return out_len;
-    }
-    fp->errcode |= BGZF_ERR_ZLIB;
-    return -1;
-}
-
-// Inflate the block in fp->compressed_block into fp->uncompressed_block; returns the inflated size, or -1 with BGZF_ERR_ZLIB set
-static int inflate_block(BGZF* fp, int block_length)
-{
- z_stream zs;
- zs.zalloc = NULL;
- zs.zfree = NULL;
- zs.next_in = fp->compressed_block + 18; // skip the block header (BLOCK_HEADER_LENGTH is 18)
- zs.avail_in = block_length - 16; // NOTE(review): 16 rather than 18, so two bytes beyond block_length - 18 are offered to zlib; confirm intent
- zs.next_out = fp->uncompressed_block;
- zs.avail_out = BGZF_MAX_BLOCK_SIZE;
-
- if (inflateInit2(&zs, -15) != Z_OK) {
- fp->errcode |= BGZF_ERR_ZLIB;
- return -1;
- }
- if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
- inflateEnd(&zs); // the stream state is released before reporting the failure
- fp->errcode |= BGZF_ERR_ZLIB;
- return -1;
- }
- if (inflateEnd(&zs) != Z_OK) {
- fp->errcode |= BGZF_ERR_ZLIB;
- return -1;
- }
- return zs.total_out;
-}
-
-static int check_header(const uint8_t *header) // non-zero when the bytes look like a BGZF block header
-{
- return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0 // bytes 31, 139, 8 and flag bit 4 set
- && unpackInt16((uint8_t*)&header[10]) == 6 // 16-bit field at offset 10 equals 6
- && header[12] == 'B' && header[13] == 'C' // "BC" identifier at offsets 12-13
- && unpackInt16((uint8_t*)&header[14]) == 2); // 16-bit field at offset 14 equals 2
-}
-
-#ifdef BGZF_CACHE
-static void free_cache(BGZF *fp) // free every cached block and the hash itself; does nothing for writers
-{
- khint_t k;
- khash_t(cache) *h = (khash_t(cache)*)fp->cache;
- if (fp->is_write) return;
- for (k = kh_begin(h); k < kh_end(h); ++k)
- if (kh_exist(h, k)) free(kh_val(h, k).block);
- kh_destroy(cache, h);
-}
-
-static int load_block_from_cache(BGZF *fp, int64_t block_address) // on a hit, restore the cached block into fp and return its size; return 0 on a miss
-{
- khint_t k;
- cache_t *p;
- khash_t(cache) *h = (khash_t(cache)*)fp->cache;
- k = kh_get(cache, h, block_address);
- if (k == kh_end(h)) return 0;
- p = &kh_val(h, k);
- if (fp->block_length != 0) fp->block_offset = 0; // same rule as bgzf_read_block(): the offset is kept when block_length is 0
- fp->block_address = block_address;
- fp->block_length = p->size;
- memcpy(fp->uncompressed_block, p->block, BGZF_MAX_BLOCK_SIZE);
- _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET); // position the file just past the block, as a real read would leave it
- return p->size;
-}
-
-static void cache_block(BGZF *fp, int size) // store the block just inflated in fp under its file address; size is its compressed length
-{
- int ret;
- khint_t k;
- cache_t *p;
- khash_t(cache) *h = (khash_t(cache)*)fp->cache;
- if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return; // the cache cannot hold even a single block
- if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > fp->cache_size) { // one more entry would exceed the budget: evict one
- /* A better way would be to remove the oldest block in the
- * cache, but here we remove a random one for simplicity. This
- * should not have a big impact on performance. */
- for (k = kh_begin(h); k < kh_end(h); ++k)
- if (kh_exist(h, k)) break;
- if (k < kh_end(h)) {
- free(kh_val(h, k).block);
- kh_del(cache, h, k);
- }
- }
- k = kh_put(cache, h, fp->block_address, &ret);
- if (ret == 0) return; // if this happens, a bug!
- p = &kh_val(h, k);
- p->size = fp->block_length;
- p->end_offset = fp->block_address + size; // file offset just past this compressed block
- p->block = malloc(BGZF_MAX_BLOCK_SIZE); // NOTE(review): result is not checked
- memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);
-}
-#else
-static void free_cache(BGZF *fp) {} // no-op when BGZF_CACHE is not defined
-static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;} // always reports a miss
-static void cache_block(BGZF *fp, int size) {} // no-op when BGZF_CACHE is not defined
-#endif
-
-/* Read and inflate the next BGZF block into fp->uncompressed_block.
- * Returns 0 on success or at end of file (fp->block_length is then 0) and -1
- * on error, with the cause recorded in fp->errcode. */
-int bgzf_read_block(BGZF *fp)
-{
-    uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
-    int count, size = 0, block_length, remaining;
-    int64_t block_address;
-    block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
-    if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0;
-    count = _bgzf_read(fp->fp, header, sizeof(header));
-    if (count == 0) { // no data read
-        fp->block_length = 0;
-        return 0;
-    }
-    if (count != sizeof(header) || !check_header(header)) {
-        fp->errcode |= BGZF_ERR_HEADER;
-        return -1;
-    }
-    size = count;
-    block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
-    if (block_length < BLOCK_HEADER_LENGTH) {
-        // a corrupt length would make `remaining` negative and turn the read below into a huge one
-        fp->errcode |= BGZF_ERR_HEADER;
-        return -1;
-    }
-    compressed_block = (uint8_t*)fp->compressed_block;
-    memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
-    remaining = block_length - BLOCK_HEADER_LENGTH;
-    count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
-    if (count != remaining) {
-        fp->errcode |= BGZF_ERR_IO;
-        return -1;
-    }
-    size += count;
-    if ((count = inflate_block(fp, block_length)) < 0) return -1;
-    if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
-    fp->block_address = block_address;
-    fp->block_length = count;
-    cache_block(fp, size);
-    return 0;
-}
-
-ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length) // copy up to length uncompressed bytes into data; returns the byte count (short at end of file) or -1 on error
-{
- ssize_t bytes_read = 0;
- uint8_t *output = data;
- if (length <= 0) return 0;
- assert(fp->is_write == 0);
- while (bytes_read < length) {
- int copy_length, available = fp->block_length - fp->block_offset;
- uint8_t *buffer;
- if (available <= 0) { // current block exhausted: load the next one
- if (bgzf_read_block(fp) != 0) return -1;
- available = fp->block_length - fp->block_offset;
- if (available <= 0) break; // nothing more to read
- }
- copy_length = length - bytes_read < available? length - bytes_read : available;
- buffer = fp->uncompressed_block;
- memcpy(output, buffer + fp->block_offset, copy_length);
- fp->block_offset += copy_length;
- output += copy_length;
- bytes_read += copy_length;
- }
- if (fp->block_offset == fp->block_length) { // block fully consumed: point block_address at the current file position
- fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
- fp->block_offset = fp->block_length = 0;
- }
- return bytes_read;
-}
-
-/***** BEGIN: multi-threading *****/
-
-typedef struct { // state of one compression worker
- BGZF *fp; // file handle; the worker reads its compress_level
- struct mtaux_t *mt; // shared state of the worker pool
- void *buf; // scratch output buffer of BGZF_MAX_BLOCK_SIZE bytes
- int i, errcode, toproc; // i: worker index; errcode: errors of the last pass; toproc: set by mt_flush() to request a pass
-} worker_t;
-
-typedef struct mtaux_t { // shared state of the multi-threaded writer
- int n_threads, n_blks, curr, done; // n_blks: capacity of blk[]; curr: blocks queued so far; done: set by mt_destroy() to stop the workers
- volatile int proc_cnt; // number of workers that finished the current pass
- void **blk; // queued blocks; compressed in place by the workers
- int *len; // length of each queued block
- worker_t *w; // one entry per worker
- pthread_t *tid; // thread ids; entry 0 is unused
- pthread_mutex_t lock; // protects toproc/done together with cv
- pthread_cond_t cv;
-} mtaux_t;
-
-static int worker_aux(worker_t *w) // run one compression pass for worker w; returns 1 when the pool is shutting down, 0 otherwise
-{
- int i, tmp, stop = 0;
- // wait for condition: to process or all done
- pthread_mutex_lock(&w->mt->lock);
- while (!w->toproc && !w->mt->done)
- pthread_cond_wait(&w->mt->cv, &w->mt->lock);
- if (w->mt->done) stop = 1;
- w->toproc = 0;
- pthread_mutex_unlock(&w->mt->lock);
- if (stop) return 1; // to quit the thread
- w->errcode = 0;
- for (i = w->i; i < w->mt->curr; i += w->mt->n_threads) { // worker i takes blocks i, i+n_threads, i+2*n_threads, ...
- int clen = BGZF_MAX_BLOCK_SIZE;
- if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->fp->compress_level) != 0)
- w->errcode |= BGZF_ERR_ZLIB;
- memcpy(w->mt->blk[i], w->buf, clen); // the compressed block replaces the raw one
- w->mt->len[i] = clen;
- }
- tmp = __sync_fetch_and_add(&w->mt->proc_cnt, 1); // report completion; tmp is not used afterwards
- return 0;
-}
-
-/* Thread entry point: keep running compression passes until worker_aux()
- * reports that the pool is shutting down. */
-static void *mt_worker(void *data)
-{
-    for (;;) {
-        if (worker_aux(data) != 0) break;
-    }
-    return 0;
-}
-
-/* Enable multi-threaded compression on a handle opened for writing.
- * n_threads workers each handle n_sub_blks blocks per flush.
- * Returns 0 on success and -1 when fp is not a writer, threading is already
- * enabled, or n_threads <= 1. */
-int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
-{
-    int i;
-    mtaux_t *mt;
-    pthread_attr_t attr;
-    if (!fp->is_write || fp->mt || n_threads <= 1) return -1;
-    mt = calloc(1, sizeof(mtaux_t));
-    mt->n_threads = n_threads;
-    mt->n_blks = n_threads * n_sub_blks;
-    mt->len = calloc(mt->n_blks, sizeof(int));
-    mt->blk = calloc(mt->n_blks, sizeof(void*));
-    for (i = 0; i < mt->n_blks; ++i)
-        mt->blk[i] = malloc(BGZF_MAX_BLOCK_SIZE);
-    mt->tid = calloc(mt->n_threads, sizeof(pthread_t)); // tid[0] is not used, as the worker 0 is launched by the master
-    mt->w = calloc(mt->n_threads, sizeof(worker_t));
-    for (i = 0; i < mt->n_threads; ++i) {
-        mt->w[i].i = i;
-        mt->w[i].mt = mt;
-        mt->w[i].fp = fp;
-        mt->w[i].buf = malloc(BGZF_MAX_BLOCK_SIZE);
-    }
-    pthread_attr_init(&attr);
-    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-    pthread_mutex_init(&mt->lock, 0);
-    pthread_cond_init(&mt->cv, 0);
-    for (i = 1; i < mt->n_threads; ++i) // worker 0 is effectively launched by the master thread
-        pthread_create(&mt->tid[i], &attr, mt_worker, &mt->w[i]);
-    pthread_attr_destroy(&attr); // the attribute object is no longer needed once the threads exist
-    fp->mt = mt;
-    return 0;
-}
-
-static void mt_destroy(mtaux_t *mt) // stop and join the worker threads, then free everything bgzf_mt() allocated
-{
- int i;
- // signal all workers to quit
- pthread_mutex_lock(&mt->lock);
- mt->done = 1; mt->proc_cnt = 0;
- pthread_cond_broadcast(&mt->cv);
- pthread_mutex_unlock(&mt->lock);
- for (i = 1; i < mt->n_threads; ++i) pthread_join(mt->tid[i], 0); // worker 0 is effectively launched by the master thread
- // free other data allocated on heap
- for (i = 0; i < mt->n_blks; ++i) free(mt->blk[i]);
- for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf);
- free(mt->blk); free(mt->len); free(mt->w); free(mt->tid);
- pthread_cond_destroy(&mt->cv);
- pthread_mutex_destroy(&mt->lock);
- free(mt);
-}
-
-static void mt_queue(BGZF *fp)
-{
- mtaux_t *mt = (mtaux_t*)fp->mt;
- assert(mt->curr < mt->n_blks); // guaranteed by the caller
- memcpy(mt->blk[mt->curr], fp->uncompressed_block, fp->block_offset);
- mt->len[mt->curr] = fp->block_offset;
- fp->block_offset = 0;
- ++mt->curr;
-}
-
-static int mt_flush(BGZF *fp)
-{
- int i;
- mtaux_t *mt = (mtaux_t*)fp->mt;
- if (fp->block_offset) mt_queue(fp); // guaranteed that assertion does not fail
- // signal all the workers to compress
- pthread_mutex_lock(&mt->lock);
- for (i = 0; i < mt->n_threads; ++i) mt->w[i].toproc = 1;
- mt->proc_cnt = 0;
- pthread_cond_broadcast(&mt->cv);
- pthread_mutex_unlock(&mt->lock);
- // worker 0 is doing things here
- worker_aux(&mt->w[0]);
- // wait for all the threads to complete
- while (mt->proc_cnt < mt->n_threads);
- // dump data to disk
- for (i = 0; i < mt->n_threads; ++i) fp->errcode |= mt->w[i].errcode;
- for (i = 0; i < mt->curr; ++i)
- if (fwrite(mt->blk[i], 1, mt->len[i], fp->fp) != mt->len[i])
- fp->errcode |= BGZF_ERR_IO;
- mt->curr = 0;
- return 0;
-}
-
-static int mt_lazy_flush(BGZF *fp)
-{
- mtaux_t *mt = (mtaux_t*)fp->mt;
- if (fp->block_offset) mt_queue(fp);
- if (mt->curr == mt->n_blks)
- return mt_flush(fp);
- return -1;
-}
-
-static ssize_t mt_write(BGZF *fp, const void *data, ssize_t length)
-{
- const uint8_t *input = data;
- ssize_t rest = length;
- while (rest) {
- int copy_length = BGZF_BLOCK_SIZE - fp->block_offset < rest? BGZF_BLOCK_SIZE - fp->block_offset : rest;
- memcpy(fp->uncompressed_block + fp->block_offset, input, copy_length);
- fp->block_offset += copy_length; input += copy_length; rest -= copy_length;
- if (fp->block_offset == BGZF_BLOCK_SIZE) mt_lazy_flush(fp);
- }
- return length - rest;
-}
-
-/***** END: multi-threading *****/
-
-int bgzf_flush(BGZF *fp)
-{
- if (!fp->is_write) return 0;
- if (fp->mt) return mt_flush(fp);
- while (fp->block_offset > 0) {
- int block_length;
- block_length = deflate_block(fp, fp->block_offset);
- if (block_length < 0) return -1;
- if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) {
- fp->errcode |= BGZF_ERR_IO; // possibly truncated file
- return -1;
- }
- fp->block_address += block_length;
- }
- return 0;
-}
-
-int bgzf_flush_try(BGZF *fp, ssize_t size)
-{
- if (fp->block_offset + size > BGZF_BLOCK_SIZE) {
- if (fp->mt) return mt_lazy_flush(fp);
- else return bgzf_flush(fp);
- }
- return -1;
-}
-
-ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length)
-{
- const uint8_t *input = data;
- int block_length = BGZF_BLOCK_SIZE, bytes_written = 0;
- assert(fp->is_write);
- if (fp->mt) return mt_write(fp, data, length);
- while (bytes_written < length) {
- uint8_t* buffer = fp->uncompressed_block;
- int copy_length = block_length - fp->block_offset < length - bytes_written? block_length - fp->block_offset : length - bytes_written;
- memcpy(buffer + fp->block_offset, input, copy_length);
- fp->block_offset += copy_length;
- input += copy_length;
- bytes_written += copy_length;
- if (fp->block_offset == block_length && bgzf_flush(fp)) break;
- }
- return bytes_written;
-}
-
-int bgzf_close(BGZF* fp)
-{
- int ret, count, block_length;
- if (fp == 0) return -1;
- if (fp->is_write) {
- if (bgzf_flush(fp) != 0) return -1;
- fp->compress_level = -1;
- block_length = deflate_block(fp, 0); // write an empty block
- count = fwrite(fp->compressed_block, 1, block_length, fp->fp);
- if (fflush(fp->fp) != 0) {
- fp->errcode |= BGZF_ERR_IO;
- return -1;
- }
- if (fp->mt) mt_destroy(fp->mt);
- }
- ret = fp->is_write? fclose(fp->fp) : _bgzf_close(fp->fp);
- if (ret != 0) return -1;
- free(fp->uncompressed_block);
- free(fp->compressed_block);
- free_cache(fp);
- free(fp);
- return 0;
-}
-
-void bgzf_set_cache_size(BGZF *fp, int cache_size)
-{
- if (fp) fp->cache_size = cache_size;
-}
-
-int bgzf_check_EOF(BGZF *fp)
-{
- static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
- uint8_t buf[28];
- off_t offset;
- offset = _bgzf_tell((_bgzf_file_t)fp->fp);
- if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0;
- _bgzf_read(fp->fp, buf, 28);
- _bgzf_seek(fp->fp, offset, SEEK_SET);
- return (memcmp(magic, buf, 28) == 0)? 1 : 0;
-}
-
-int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
-{
- int block_offset;
- int64_t block_address;
-
- if (fp->is_write || where != SEEK_SET) {
- fp->errcode |= BGZF_ERR_MISUSE;
- return -1;
- }
- block_offset = pos & 0xFFFF;
- block_address = pos >> 16;
- if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) {
- fp->errcode |= BGZF_ERR_IO;
- return -1;
- }
- fp->block_length = 0; // indicates current block has not been loaded
- fp->block_address = block_address;
- fp->block_offset = block_offset;
- return 0;
-}
-
-int bgzf_is_bgzf(const char *fn)
-{
- uint8_t buf[16];
- int n;
- _bgzf_file_t fp;
- if ((fp = _bgzf_open(fn, "r")) == 0) return 0;
- n = _bgzf_read(fp, buf, 16);
- _bgzf_close(fp);
- if (n != 16) return 0;
- return memcmp(g_magic, buf, 16) == 0? 1 : 0;
-}
-
-int bgzf_getc(BGZF *fp)
-{
- int c;
- if (fp->block_offset >= fp->block_length) {
- if (bgzf_read_block(fp) != 0) return -2; /* error */
- if (fp->block_length == 0) return -1; /* end-of-file */
- }
- c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
- if (fp->block_offset == fp->block_length) {
- fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
- fp->block_offset = 0;
- fp->block_length = 0;
- }
- return c;
-}
-
-#ifndef kroundup32
-#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
-#endif
-
-int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
-{
- int l, state = 0;
- unsigned char *buf = (unsigned char*)fp->uncompressed_block;
- str->l = 0;
- do {
- if (fp->block_offset >= fp->block_length) {
- if (bgzf_read_block(fp) != 0) { state = -2; break; }
- if (fp->block_length == 0) { state = -1; break; }
- }
- for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
- if (l < fp->block_length) state = 1;
- l -= fp->block_offset;
- if (str->l + l + 1 >= str->m) {
- str->m = str->l + l + 2;
- kroundup32(str->m);
- str->s = (char*)realloc(str->s, str->m);
- }
- memcpy(str->s + str->l, buf + fp->block_offset, l);
- str->l += l;
- fp->block_offset += l + 1;
- if (fp->block_offset >= fp->block_length) {
- fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
- fp->block_offset = 0;
- fp->block_length = 0;
- }
- } while (state == 0);
- if (str->l == 0 && state < 0) return state;
- str->s[str->l] = 0;
- return str->l;
-}
diff --git a/sam/bgzf.h b/sam/bgzf.h
deleted file mode 100644
index cb67681..0000000
--- a/sam/bgzf.h
+++ /dev/null
@@ -1,207 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
- 2011, 2012 Attractive Chaos <***@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
-*/
-
-/* The BGZF library was originally written by Bob Handsaker from the Broad
- * Institute. It was later improved by the SAMtools developers. */
-
-#ifndef __BGZF_H
-#define __BGZF_H
-
-#include <stdint.h>
-#include <stdio.h>
-#include <zlib.h>
-#include <sys/types.h>
-
-#define BGZF_BLOCK_SIZE 0xff00 // make sure compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE
-#define BGZF_MAX_BLOCK_SIZE 0x10000
-
-#define BGZF_ERR_ZLIB 1
-#define BGZF_ERR_HEADER 2
-#define BGZF_ERR_IO 4
-#define BGZF_ERR_MISUSE 8
-
-typedef struct {
- int errcode:16, is_write:2, compress_level:14;
- int cache_size;
- int block_length, block_offset;
- int64_t block_address;
- void *uncompressed_block, *compressed_block;
- void *cache; // a pointer to a hash table
- void *fp; // actual file handler; FILE* on writing; FILE* or knetFile* on reading
- void *mt; // only used for multi-threading
-} BGZF;
-
-#ifndef KSTRING_T
-#define KSTRING_T kstring_t
-typedef struct __kstring_t {
- size_t l, m;
- char *s;
-} kstring_t;
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- /******************
- * Basic routines *
- ******************/
-
- /**
- * Open an existing file descriptor for reading or writing.
- *
- * @param fd file descriptor
- * @param mode mode matching /[rwu0-9]+/: 'r' for reading, 'w' for writing and a digit specifies
- * the zlib compression level; if both 'r' and 'w' are present, 'w' is ignored.
- * @return BGZF file handler; 0 on error
- */
- BGZF* bgzf_dopen(int fd, const char *mode);
-
- #define bgzf_fdopen(fd, mode) bgzf_dopen((fd), (mode)) // for backward compatibility
-
- /**
- * Open the specified file for reading or writing.
- */
- BGZF* bgzf_open(const char* path, const char *mode);
-
- /**
- * Close the BGZF and free all associated resources.
- *
- * @param fp BGZF file handler
- * @return 0 on success and -1 on error
- */
- int bgzf_close(BGZF *fp);
-
- /**
- * Read up to _length_ bytes from the file storing into _data_.
- *
- * @param fp BGZF file handler
- * @param data data array to read into
- * @param length size of data to read
- * @return number of bytes actually read; 0 on end-of-file and -1 on error
- */
- ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length);
-
- /**
- * Write _length_ bytes from _data_ to the file.
- *
- * @param fp BGZF file handler
- * @param data data array to write
- * @param length size of data to write
- * @return number of bytes actually written; -1 on error
- */
- ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length);
-
- /**
- * Write the data in the buffer to the file.
- */
- int bgzf_flush(BGZF *fp);
-
- /**
- * Return a virtual file pointer to the current location in the file.
- * No interpetation of the value should be made, other than a subsequent
- * call to bgzf_seek can be used to position the file at the same point.
- * Return value is non-negative on success.
- */
- #define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF))
-
- /**
- * Set the file to read from the location specified by _pos_.
- *
- * @param fp BGZF file handler
- * @param pos virtual file offset returned by bgzf_tell()
- * @param whence must be SEEK_SET
- * @return 0 on success and -1 on error
- */
- int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence);
-
- /**
- * Check if the BGZF end-of-file (EOF) marker is present
- *
- * @param fp BGZF file handler opened for reading
- * @return 1 if EOF is present; 0 if not or on I/O error
- */
- int bgzf_check_EOF(BGZF *fp);
-
- /**
- * Check if a file is in the BGZF format
- *
- * @param fn file name
- * @return 1 if _fn_ is BGZF; 0 if not or on I/O error
- */
- int bgzf_is_bgzf(const char *fn);
-
- /*********************
- * Advanced routines *
- *********************/
-
- /**
- * Set the cache size. Only effective when compiled with -DBGZF_CACHE.
- *
- * @param fp BGZF file handler
- * @param size size of cache in bytes; 0 to disable caching (default)
- */
- void bgzf_set_cache_size(BGZF *fp, int size);
-
- /**
- * Flush the file if the remaining buffer size is smaller than _size_
- */
- int bgzf_flush_try(BGZF *fp, ssize_t size);
-
- /**
- * Read one byte from a BGZF file. It is faster than bgzf_read()
- * @param fp BGZF file handler
- * @return byte read; -1 on end-of-file or error
- */
- int bgzf_getc(BGZF *fp);
-
- /**
- * Read one line from a BGZF file. It is faster than bgzf_getc()
- *
- * @param fp BGZF file handler
- * @param delim delimitor
- * @param str string to write to; must be initialized
- * @return length of the string; 0 on end-of-file; negative on error
- */
- int bgzf_getline(BGZF *fp, int delim, kstring_t *str);
-
- /**
- * Read the next BGZF block.
- */
- int bgzf_read_block(BGZF *fp);
-
- /**
- * Enable multi-threading (only effective on writing)
- *
- * @param fp BGZF file handler; must be opened for writing
- * @param n_threads #threads used for writing
- * @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended
- */
- int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/sam/bgzip.c b/sam/bgzip.c
deleted file mode 100644
index ebcafa2..0000000
--- a/sam/bgzip.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
-*/
-
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <errno.h>
-#include <sys/select.h>
-#include <sys/stat.h>
-#include "bgzf.h"
-
-static const int WINDOW_SIZE = 64 * 1024;
-
-static int bgzip_main_usage()
-{
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: bgzip [options] [file] ...\n\n");
- fprintf(stderr, "Options: -c write on standard output, keep original files unchanged\n");
- fprintf(stderr, " -d decompress\n");
- fprintf(stderr, " -f overwrite files without asking\n");
- fprintf(stderr, " -b INT decompress at virtual file pointer INT\n");
- fprintf(stderr, " -s INT decompress INT bytes in the uncompressed file\n");
- fprintf(stderr, " -h give this help\n");
- fprintf(stderr, "\n");
- return 1;
-}
-
-static int write_open(const char *fn, int is_forced)
-{
- int fd = -1;
- char c;
- if (!is_forced) {
- if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) {
- fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn);
- scanf("%c", &c);
- if (c != 'Y' && c != 'y') {
- fprintf(stderr, "[bgzip] not overwritten\n");
- exit(1);
- }
- }
- }
- if (fd < 0) {
- if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) {
- fprintf(stderr, "[bgzip] %s: Fail to write\n", fn);
- exit(1);
- }
- }
- return fd;
-}
-
-static void fail(BGZF* fp)
-{
- fprintf(stderr, "Error: %s\n", fp->error);
- exit(1);
-}
-
-int main(int argc, char **argv)
-{
- int c, compress, pstdout, is_forced;
- BGZF *fp;
- void *buffer;
- long start, end, size;
-
- compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
- while((c = getopt(argc, argv, "cdhfb:s:")) >= 0){
- switch(c){
- case 'h': return bgzip_main_usage();
- case 'd': compress = 0; break;
- case 'c': pstdout = 1; break;
- case 'b': start = atol(optarg); break;
- case 's': size = atol(optarg); break;
- case 'f': is_forced = 1; break;
- }
- }
- if (size >= 0) end = start + size;
- if (end >= 0 && end < start) {
- fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end);
- return 1;
- }
- if (compress == 1) {
- struct stat sbuf;
- int f_src = fileno(stdin);
- int f_dst = fileno(stdout);
-
- if ( argc>optind )
- {
- if ( stat(argv[optind],&sbuf)<0 )
- {
- fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
- return 1;
- }
-
- if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
- fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
- return 1;
- }
-
- if (pstdout)
- f_dst = fileno(stdout);
- else
- {
- char *name = malloc(strlen(argv[optind]) + 5);
- strcpy(name, argv[optind]);
- strcat(name, ".gz");
- f_dst = write_open(name, is_forced);
- if (f_dst < 0) return 1;
- free(name);
- }
- }
- else if (!pstdout && isatty(fileno((FILE *)stdout)) )
- return bgzip_main_usage();
-
- fp = bgzf_fdopen(f_dst, "w");
- buffer = malloc(WINDOW_SIZE);
- while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
- if (bgzf_write(fp, buffer, c) < 0) fail(fp);
- // f_dst will be closed here
- if (bgzf_close(fp) < 0) fail(fp);
- if (argc > optind && !pstdout) unlink(argv[optind]);
- free(buffer);
- close(f_src);
- return 0;
- } else {
- struct stat sbuf;
- int f_dst;
-
- if ( argc>optind )
- {
- if ( stat(argv[optind],&sbuf)<0 )
- {
- fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
- return 1;
- }
- char *name;
- int len = strlen(argv[optind]);
- if ( strcmp(argv[optind]+len-3,".gz") )
- {
- fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
- return 1;
- }
- fp = bgzf_open(argv[optind], "r");
- if (fp == NULL) {
- fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
- return 1;
- }
-
- if (pstdout) {
- f_dst = fileno(stdout);
- }
- else {
- name = strdup(argv[optind]);
- name[strlen(name) - 3] = '\0';
- f_dst = write_open(name, is_forced);
- free(name);
- }
- }
- else if (!pstdout && isatty(fileno((FILE *)stdin)) )
- return bgzip_main_usage();
- else
- {
- f_dst = fileno(stdout);
- fp = bgzf_fdopen(fileno(stdin), "r");
- if (fp == NULL) {
- fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
- return 1;
- }
- }
- buffer = malloc(WINDOW_SIZE);
- if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp);
- while (1) {
- if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE);
- else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
- if (c == 0) break;
- if (c < 0) fail(fp);
- start += c;
- write(f_dst, buffer, c);
- if (end >= 0 && start >= end) break;
- }
- free(buffer);
- if (bgzf_close(fp) < 0) fail(fp);
- if (!pstdout) unlink(argv[optind]);
- return 0;
- }
-}
diff --git a/sam/cut_target.c b/sam/cut_target.c
deleted file mode 100644
index 26f434f..0000000
--- a/sam/cut_target.c
+++ /dev/null
@@ -1,193 +0,0 @@
-#include <unistd.h>
-#include <stdlib.h>
-#include <string.h>
-#include "bam.h"
-#include "errmod.h"
-#include "faidx.h"
-
-#define ERR_DEP 0.83f
-
-typedef struct {
- int e[2][3], p[2][2];
-} score_param_t;
-
-/* Note that although the two matrics have 10 parameters in total, only 4
- * (probably 3) are free. Changing the scoring matrices in a sort of symmetric
- * way will not change the result. */
-static score_param_t g_param = { {{0,0,0},{-4,1,6}}, {{0,-14000}, {0,0}} };
-
-typedef struct {
- int min_baseQ, tid, max_bases;
- uint16_t *bases;
- bamFile fp;
- bam_header_t *h;
- char *ref;
- faidx_t *fai;
- errmod_t *em;
-} ct_t;
-
-static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp)
-{
- int i, j, ret, tmp, k, sum[4], qual;
- float q[16];
- if (n > g->max_bases) { // enlarge g->bases
- g->max_bases = n;
- kroundup32(g->max_bases);
- g->bases = realloc(g->bases, g->max_bases * 2);
- }
- for (i = k = 0; i < n; ++i) {
- const bam_pileup1_t *p = plp + i;
- uint8_t *seq;
- int q, baseQ, b;
- if (p->is_refskip || p->is_del) continue;
- baseQ = bam1_qual(p->b)[p->qpos];
- if (baseQ < g->min_baseQ) continue;
- seq = bam1_seq(p->b);
- b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)];
- if (b > 3) continue;
- q = baseQ < p->b->core.qual? baseQ : p->b->core.qual;
- if (q < 4) q = 4;
- if (q > 63) q = 63;
- g->bases[k++] = q<<5 | bam1_strand(p->b)<<4 | b;
- }
- if (k == 0) return 0;
- errmod_cal(g->em, k, 4, g->bases, q);
- for (i = 0; i < 4; ++i) sum[i] = (int)(q[i<<2|i] + .499) << 2 | i;
- for (i = 1; i < 4; ++i) // insertion sort
- for (j = i; j > 0 && sum[j] < sum[j-1]; --j)
- tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp;
- qual = (sum[1]>>2) - (sum[0]>>2);
- k = k < 256? k : 255;
- ret = (qual < 63? qual : 63) << 2 | (sum[0]&3);
- return ret<<8|k;
-}
-
-static void process_cns(bam_header_t *h, int tid, int l, uint16_t *cns)
-{
- int i, f[2][2], *prev, *curr, *swap_tmp, s;
- uint8_t *b; // backtrack array
- b = calloc(l, 1);
- f[0][0] = f[0][1] = 0;
- prev = f[0]; curr = f[1];
- // fill the backtrack matrix
- for (i = 0; i < l; ++i) {
- int c = (cns[i] == 0)? 0 : (cns[i]>>8 == 0)? 1 : 2;
- int tmp0, tmp1;
- // compute f[0]
- tmp0 = prev[0] + g_param.e[0][c] + g_param.p[0][0]; // (s[i+1],s[i])=(0,0)
- tmp1 = prev[1] + g_param.e[0][c] + g_param.p[1][0]; // (0,1)
- if (tmp0 > tmp1) curr[0] = tmp0, b[i] = 0;
- else curr[0] = tmp1, b[i] = 1;
- // compute f[1]
- tmp0 = prev[0] + g_param.e[1][c] + g_param.p[0][1]; // (s[i+1],s[i])=(1,0)
- tmp1 = prev[1] + g_param.e[1][c] + g_param.p[1][1]; // (1,1)
- if (tmp0 > tmp1) curr[1] = tmp0, b[i] |= 0<<1;
- else curr[1] = tmp1, b[i] |= 1<<1;
- // swap
- swap_tmp = prev; prev = curr; curr = swap_tmp;
- }
- // backtrack
- s = prev[0] > prev[1]? 0 : 1;
- for (i = l - 1; i > 0; --i) {
- b[i] |= s<<2;
- s = b[i]>>s&1;
- }
- // print
- for (i = 0, s = -1; i <= l; ++i) {
- if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) {
- if (s >= 0) {
- int j;
- printf("%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s);
- for (j = s; j < i; ++j) {
- int c = cns[j]>>8;
- if (c == 0) putchar('N');
- else putchar("ACGT"[c&3]);
- }
- putchar('\t');
- for (j = s; j < i; ++j)
- putchar(33 + (cns[j]>>8>>2));
- putchar('\n');
- }
- //if (s >= 0) printf("%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s);
- s = -1;
- } else if ((b[i]>>2&3) && s < 0) s = i;
- }
- free(b);
-}
-
-static int read_aln(void *data, bam1_t *b)
-{
- extern int bam_prob_realn_core(bam1_t *b, const char *ref, int flag);
- ct_t *g = (ct_t*)data;
- int ret, len;
- ret = bam_read1(g->fp, b);
- if (ret >= 0 && g->fai && b->core.tid >= 0 && (b->core.flag&4) == 0) {
- if (b->core.tid != g->tid) { // then load the sequence
- free(g->ref);
- g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len);
- g->tid = b->core.tid;
- }
- bam_prob_realn_core(b, g->ref, 1<<1|1);
- }
- return ret;
-}
-
-int main_cut_target(int argc, char *argv[])
-{
- int c, tid, pos, n, lasttid = -1, lastpos = -1, l, max_l;
- const bam_pileup1_t *p;
- bam_plp_t plp;
- uint16_t *cns;
- ct_t g;
-
- memset(&g, 0, sizeof(ct_t));
- g.min_baseQ = 13; g.tid = -1;
- while ((c = getopt(argc, argv, "f:Q:i:o:0:1:2:")) >= 0) {
- switch (c) {
- case 'Q': g.min_baseQ = atoi(optarg); break; // quality cutoff
- case 'i': g_param.p[0][1] = -atoi(optarg); break; // 0->1 transition (in) PENALTY
- case '0': g_param.e[1][0] = atoi(optarg); break; // emission SCORE
- case '1': g_param.e[1][1] = atoi(optarg); break;
- case '2': g_param.e[1][2] = atoi(optarg); break;
- case 'f': g.fai = fai_load(optarg);
- if (g.fai == 0) fprintf(stderr, "[%s] fail to load the fasta index.\n", __func__);
- break;
- }
- }
- if (argc == optind) {
- fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] [-f ref] <in.bam>\n");
- return 1;
- }
- l = max_l = 0; cns = 0;
- g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
- g.h = bam_header_read(g.fp);
- g.em = errmod_init(1 - ERR_DEP);
- plp = bam_plp_init(read_aln, &g);
- while ((p = bam_plp_auto(plp, &tid, &pos, &n)) != 0) {
- if (tid < 0) break;
- if (tid != lasttid) { // change of chromosome
- if (cns) process_cns(g.h, lasttid, l, cns);
- if (max_l < g.h->target_len[tid]) {
- max_l = g.h->target_len[tid];
- kroundup32(max_l);
- cns = realloc(cns, max_l * 2);
- }
- l = g.h->target_len[tid];
- memset(cns, 0, max_l * 2);
- lasttid = tid;
- }
- cns[pos] = gencns(&g, n, p);
- lastpos = pos;
- }
- process_cns(g.h, lasttid, l, cns);
- free(cns);
- bam_header_destroy(g.h);
- bam_plp_destroy(plp);
- bam_close(g.fp);
- if (g.fai) {
- fai_destroy(g.fai); free(g.ref);
- }
- errmod_destroy(g.em);
- free(g.bases);
- return 0;
-}
diff --git a/sam/errmod.c b/sam/errmod.c
deleted file mode 100644
index fba9a8d..0000000
--- a/sam/errmod.c
+++ /dev/null
@@ -1,130 +0,0 @@
-#include <math.h>
-#include "errmod.h"
-#include "ksort.h"
-KSORT_INIT_GENERIC(uint16_t)
-
-typedef struct __errmod_coef_t {
- double *fk, *beta, *lhet;
-} errmod_coef_t;
-
-typedef struct {
- double fsum[16], bsum[16];
- uint32_t c[16];
-} call_aux_t;
-
-static errmod_coef_t *cal_coef(double depcorr, double eta)
-{
- int k, n, q;
- long double sum, sum1;
- double *lC;
- errmod_coef_t *ec;
-
- ec = calloc(1, sizeof(errmod_coef_t));
- // initialize ->fk
- ec->fk = (double*)calloc(256, sizeof(double));
- ec->fk[0] = 1.0;
- for (n = 1; n != 256; ++n)
- ec->fk[n] = pow(1. - depcorr, n) * (1.0 - eta) + eta;
- // initialize ->coef
- ec->beta = (double*)calloc(256 * 256 * 64, sizeof(double));
- lC = (double*)calloc(256 * 256, sizeof(double));
- for (n = 1; n != 256; ++n) {
- double lgn = lgamma(n+1);
- for (k = 1; k <= n; ++k)
- lC[n<<8|k] = lgn - lgamma(k+1) - lgamma(n-k+1);
- }
- for (q = 1; q != 64; ++q) {
- double e = pow(10.0, -q/10.0);
- double le = log(e);
- double le1 = log(1.0 - e);
- for (n = 1; n <= 255; ++n) {
- double *beta = ec->beta + (q<<16|n<<8);
- sum1 = sum = 0.0;
- for (k = n; k >= 0; --k, sum1 = sum) {
- sum = sum1 + expl(lC[n<<8|k] + k*le + (n-k)*le1);
- beta[k] = -10. / M_LN10 * logl(sum1 / sum);
- }
- }
- }
- // initialize ->lhet
- ec->lhet = (double*)calloc(256 * 256, sizeof(double));
- for (n = 0; n < 256; ++n)
- for (k = 0; k < 256; ++k)
- ec->lhet[n<<8|k] = lC[n<<8|k] - M_LN2 * n;
- free(lC);
- return ec;
-}
-
-errmod_t *errmod_init(float depcorr)
-{
- errmod_t *em;
- em = (errmod_t*)calloc(1, sizeof(errmod_t));
- em->depcorr = depcorr;
- em->coef = cal_coef(depcorr, 0.03);
- return em;
-}
-
-void errmod_destroy(errmod_t *em)
-{
- if (em == 0) return;
- free(em->coef->lhet); free(em->coef->fk); free(em->coef->beta);
- free(em->coef); free(em);
-}
-// qual:6, strand:1, base:4
-int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q)
-{
- call_aux_t aux;
- int i, j, k, w[32];
-
- if (m > m) return -1;
- memset(q, 0, m * m * sizeof(float));
- if (n == 0) return 0;
- // calculate aux.esum and aux.fsum
- if (n > 255) { // then sample 255 bases
- ks_shuffle(uint16_t, n, bases);
- n = 255;
- }
- ks_introsort(uint16_t, n, bases);
- memset(w, 0, 32 * sizeof(int));
- memset(&aux, 0, sizeof(call_aux_t));
- for (j = n - 1; j >= 0; --j) { // calculate esum and fsum
- uint16_t b = bases[j];
- int q = b>>5 < 4? 4 : b>>5;
- if (q > 63) q = 63;
- k = b&0x1f;
- aux.fsum[k&0xf] += em->coef->fk[w[k]];
- aux.bsum[k&0xf] += em->coef->fk[w[k]] * em->coef->beta[q<<16|n<<8|aux.c[k&0xf]];
- ++aux.c[k&0xf];
- ++w[k];
- }
- // generate likelihood
- for (j = 0; j != m; ++j) {
- float tmp1, tmp3;
- int tmp2, bar_e;
- // homozygous
- for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != m; ++k) {
- if (k == j) continue;
- tmp1 += aux.bsum[k]; tmp2 += aux.c[k]; tmp3 += aux.fsum[k];
- }
- if (tmp2) {
- bar_e = (int)(tmp1 / tmp3 + 0.499);
- if (bar_e > 63) bar_e = 63;
- q[j*m+j] = tmp1;
- }
- // heterozygous
- for (k = j + 1; k < m; ++k) {
- int cjk = aux.c[j] + aux.c[k];
- for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i < m; ++i) {
- if (i == j || i == k) continue;
- tmp1 += aux.bsum[i]; tmp2 += aux.c[i]; tmp3 += aux.fsum[i];
- }
- if (tmp2) {
- bar_e = (int)(tmp1 / tmp3 + 0.499);
- if (bar_e > 63) bar_e = 63;
- q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1;
- } else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k
- }
- for (k = 0; k != m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0;
- }
- return 0;
-}
diff --git a/sam/errmod.h b/sam/errmod.h
deleted file mode 100644
index 32c07b6..0000000
--- a/sam/errmod.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef ERRMOD_H
-#define ERRMOD_H
-
-#include <stdint.h>
-
-struct __errmod_coef_t;
-
-typedef struct {
- double depcorr;
- struct __errmod_coef_t *coef;
-} errmod_t;
-
-errmod_t *errmod_init(float depcorr);
-void errmod_destroy(errmod_t *em);
-
-/*
- n: number of bases
- m: maximum base
- bases[i]: qual:6, strand:1, base:4
- q[i*m+j]: phred-scaled likelihood of (i,j)
- */
-int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q);
-
-#endif
diff --git a/sam/examples/._00README.txt b/sam/examples/._00README.txt
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/examples/._00README.txt and /dev/null differ
diff --git a/sam/examples/._Makefile b/sam/examples/._Makefile
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/examples/._Makefile and /dev/null differ
diff --git a/sam/examples/._bam2bed.c b/sam/examples/._bam2bed.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/examples/._bam2bed.c and /dev/null differ
diff --git a/sam/examples/._calDepth.c b/sam/examples/._calDepth.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/examples/._calDepth.c and /dev/null differ
diff --git a/sam/examples/._chk_indel.c b/sam/examples/._chk_indel.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/examples/._chk_indel.c and /dev/null differ
diff --git a/sam/examples/._ex1.fa b/sam/examples/._ex1.fa
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/examples/._ex1.fa and /dev/null differ
diff --git a/sam/examples/._ex1.sam.gz b/sam/examples/._ex1.sam.gz
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/examples/._ex1.sam.gz and /dev/null differ
diff --git a/sam/examples/._toy.fa b/sam/examples/._toy.fa
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/examples/._toy.fa and /dev/null differ
diff --git a/sam/examples/._toy.sam b/sam/examples/._toy.sam
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/examples/._toy.sam and /dev/null differ
diff --git a/sam/examples/00README.txt b/sam/examples/00README.txt
deleted file mode 100644
index dbb276f..0000000
--- a/sam/examples/00README.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-File ex1.fa contains two sequences cut from the human genome
-build36. They were exatracted with command:
-
- samtools faidx human_b36.fa 2:2043966-2045540 20:67967-69550
-
-Sequence names were changed manually for simplicity. File ex1.sam.gz
-contains MAQ alignments exatracted with:
-
- (samtools view NA18507_maq.bam 2:2044001-2045500;
- samtools view NA18507_maq.bam 20:68001-69500)
-
-and processed with `samtools fixmate' to make it self-consistent as a
-standalone alignment.
-
-To try samtools, you may run the following commands:
-
- samtools faidx ex1.fa # index the reference FASTA
- samtools import ex1.fa.fai ex1.sam.gz ex1.bam # SAM->BAM
- samtools index ex1.bam # index BAM
- samtools tview ex1.bam ex1.fa # view alignment
- samtools pileup -cf ex1.fa ex1.bam # pileup and consensus
- samtools pileup -cf ex1.fa -t ex1.fa.fai ex1.sam.gz
-
diff --git a/sam/examples/Makefile b/sam/examples/Makefile
deleted file mode 100644
index 309399f..0000000
--- a/sam/examples/Makefile
+++ /dev/null
@@ -1,50 +0,0 @@
-all:../libbam.a ../samtools ../bcftools/bcftools \
- ex1.glf ex1.pileup.gz ex1.bam.bai ex1f-rmduppe.bam ex1f-rmdupse.bam ex1.glfview.gz ex1.bcf calDepth
- @echo; echo \# You can now launch the viewer with: \'samtools tview ex1.bam ex1.fa\'; echo;
-
-ex1.fa.fai:ex1.fa
- ../samtools faidx ex1.fa
-ex1.bam:ex1.sam.gz ex1.fa.fai
- ../samtools import ex1.fa.fai ex1.sam.gz ex1.bam
-ex1.bam.bai:ex1.bam
- ../samtools index ex1.bam
-ex1.pileup.gz:ex1.bam ex1.fa
- ../samtools pileup -cf ex1.fa ex1.bam | gzip > ex1.pileup.gz
-ex1.glf:ex1.bam ex1.fa
- ../samtools pileup -gf ex1.fa ex1.bam > ex1.glf
-ex1.glfview.gz:ex1.glf
- ../samtools glfview ex1.glf | gzip > ex1.glfview.gz
-ex1a.bam:ex1.bam
- ../samtools view -h ex1.bam | awk 'BEGIN{FS=OFS="\t"}{if(/^@/)print;else{$$1=$$1"a";print}}' | ../samtools view -bS - > $@
-ex1b.bam:ex1.bam
- ../samtools view -h ex1.bam | awk 'BEGIN{FS=OFS="\t"}{if(/^@/)print;else{$$1=$$1"b";print}}' | ../samtools view -bS - > $@
-ex1f.rg:
- (echo "@RG ID:ex1 LB:ex1 SM:ex1"; echo "@RG ID:ex1a LB:ex1 SM:ex1"; echo "@RG ID:ex1b LB:ex1b SM:ex1b") > $@
-ex1f.bam:ex1.bam ex1a.bam ex1b.bam ex1f.rg
- ../samtools merge -rh ex1f.rg $@ ex1.bam ex1a.bam ex1b.bam
-ex1f-rmduppe.bam:ex1f.bam
- ../samtools rmdup ex1f.bam $@
-ex1f-rmdupse.bam:ex1f.bam
- ../samtools rmdup -S ex1f.bam $@
-
-ex1.bcf:ex1.bam ex1.fa.fai
- ../samtools mpileup -gf ex1.fa ex1.bam > $@
-
-../bcftools/bcftools:
- (cd ../bcftools; make bcftools)
-
-../samtools:
- (cd ..; make samtools)
-
-../libbam.a:
- (cd ..; make libbam.a)
-
-calDepth:../libbam.a calDepth.c
- gcc -g -Wall -O2 -I.. calDepth.c -o $@ -L.. -lbam -lm -lz
-
-clean:
- rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM ex1*.rg ex1.bcf
-
-# ../samtools pileup ex1.bam|perl -ape '$_=$F[4];s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Z//,tr/a-z//);$_=join("\t",@F[0,1],@_)."\n"'
-
-# ../samtools pileup -cf ex1.fa ex1.bam|perl -ape '$_=$F[8];s/\^.//g;s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Za-z//,tr/,.//);$_=join("\t",@F[0,1],@_)."\n"'
diff --git a/sam/examples/bam2bed.c b/sam/examples/bam2bed.c
deleted file mode 100644
index bb937d1..0000000
--- a/sam/examples/bam2bed.c
+++ /dev/null
@@ -1,51 +0,0 @@
-#include <stdio.h>
-#include "sam.h"
-static int fetch_func(const bam1_t *b, void *data)
-{
- samfile_t *fp = (samfile_t*)data;
- uint32_t *cigar = bam1_cigar(b);
- const bam1_core_t *c = &b->core;
- int i, l;
- if (b->core.tid < 0) return 0;
- for (i = l = 0; i < c->n_cigar; ++i) {
- int op = cigar[i]&0xf;
- if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP)
- l += cigar[i]>>4;
- }
- printf("%s\t%d\t%d\t%s\t%d\t%c\n", fp->header->target_name[c->tid],
- c->pos, c->pos + l, bam1_qname(b), c->qual, (c->flag&BAM_FREVERSE)? '-' : '+');
- return 0;
-}
-int main(int argc, char *argv[])
-{
- samfile_t *fp;
- if (argc == 1) {
- fprintf(stderr, "Usage: bam2bed <in.bam> [region]\n");
- return 1;
- }
- if ((fp = samopen(argv[1], "rb", 0)) == 0) {
- fprintf(stderr, "bam2bed: Fail to open BAM file %s\n", argv[1]);
- return 1;
- }
- if (argc == 2) { /* if a region is not specified */
- bam1_t *b = bam_init1();
- while (samread(fp, b) >= 0) fetch_func(b, fp);
- bam_destroy1(b);
- } else {
- int ref, beg, end;
- bam_index_t *idx;
- if ((idx = bam_index_load(argv[1])) == 0) {
- fprintf(stderr, "bam2bed: BAM indexing file is not available.\n");
- return 1;
- }
- bam_parse_region(fp->header, argv[2], &ref, &beg, &end);
- if (ref < 0) {
- fprintf(stderr, "bam2bed: Invalid region %s\n", argv[2]);
- return 1;
- }
- bam_fetch(fp->x.bam, idx, ref, beg, end, fp, fetch_func);
- bam_index_destroy(idx);
- }
- samclose(fp);
- return 0;
-}
diff --git a/sam/examples/calDepth.c b/sam/examples/calDepth.c
deleted file mode 100644
index 7a3239c..0000000
--- a/sam/examples/calDepth.c
+++ /dev/null
@@ -1,62 +0,0 @@
-#include <stdio.h>
-#include "sam.h"
-
-typedef struct {
- int beg, end;
- samfile_t *in;
-} tmpstruct_t;
-
-// callback for bam_fetch()
-static int fetch_func(const bam1_t *b, void *data)
-{
- bam_plbuf_t *buf = (bam_plbuf_t*)data;
- bam_plbuf_push(b, buf);
- return 0;
-}
-// callback for bam_plbuf_init()
-static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
-{
- tmpstruct_t *tmp = (tmpstruct_t*)data;
- if ((int)pos >= tmp->beg && (int)pos < tmp->end)
- printf("%s\t%d\t%d\n", tmp->in->header->target_name[tid], pos + 1, n);
- return 0;
-}
-
-int main(int argc, char *argv[])
-{
- tmpstruct_t tmp;
- if (argc == 1) {
- fprintf(stderr, "Usage: calDepth <in.bam> [region]\n");
- return 1;
- }
- tmp.beg = 0; tmp.end = 0x7fffffff;
- tmp.in = samopen(argv[1], "rb", 0);
- if (tmp.in == 0) {
- fprintf(stderr, "Fail to open BAM file %s\n", argv[1]);
- return 1;
- }
- if (argc == 2) { // if a region is not specified
- sampileup(tmp.in, -1, pileup_func, &tmp);
- } else {
- int ref;
- bam_index_t *idx;
- bam_plbuf_t *buf;
- idx = bam_index_load(argv[1]); // load BAM index
- if (idx == 0) {
- fprintf(stderr, "BAM indexing file is not available.\n");
- return 1;
- }
- bam_parse_region(tmp.in->header, argv[2], &ref, &tmp.beg, &tmp.end); // parse the region
- if (ref < 0) {
- fprintf(stderr, "Invalid region %s\n", argv[2]);
- return 1;
- }
- buf = bam_plbuf_init(pileup_func, &tmp); // initialize pileup
- bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, fetch_func);
- bam_plbuf_push(0, buf); // finalize pileup
- bam_index_destroy(idx);
- bam_plbuf_destroy(buf);
- }
- samclose(tmp.in);
- return 0;
-}
diff --git a/sam/examples/chk_indel.c b/sam/examples/chk_indel.c
deleted file mode 100644
index aaa77e0..0000000
--- a/sam/examples/chk_indel.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/* To compile, copy this file to the samtools source code directory and compile with:
- gcc -g -O2 -Wall chk_indel_rg.c -o chk_indel_rg -Wall -I. -L. -lbam -lz
-*/
-
-#include <string.h>
-#include "bam.h"
-
-typedef struct {
- long cnt[4]; // short:ins, short:del, long:ins, long:del
-} rgcnt_t;
-
-#include "khash.h"
-KHASH_MAP_INIT_STR(rgcnt, rgcnt_t)
-
-#define MAX_LEN 127
-#define Q_THRES 10
-#define L_THRES 6 // short: <=L_THRES; otherwise long
-
-int main(int argc, char *argv[])
-{
- bamFile fp;
- bam1_t *b;
- int i, x;
- khash_t(rgcnt) *h;
- khint_t k;
-
- if (argc == 1) {
- fprintf(stderr, "Usage: chk_indel_rg <in.bam>\n\n");
- fprintf(stderr, "Output: filename, RG, #ins-in-short-homopolymer, #del-in-short, #ins-in-long, #del-in-long\n");
- return 1;
- }
-
- h = kh_init(rgcnt);
- fp = bam_open(argv[1], "r");
- bam_header_destroy(bam_header_read(fp)); // we do not need the header
- b = bam_init1();
-
- while (bam_read1(fp, b) >= 0) {
- if (b->core.n_cigar >= 3 && b->core.qual >= Q_THRES) {
- const uint8_t *seq;
- const uint32_t *cigar = bam1_cigar(b);
- char *rg;
- for (i = 0; i < b->core.n_cigar; ++i) // check if there are 1bp indels
- if (bam_cigar_oplen(cigar[i]) == 1 && (bam_cigar_op(cigar[i]) == BAM_CDEL || bam_cigar_op(cigar[i]) == BAM_CINS))
- break;
- if (i == b->core.n_cigar) continue; // no 1bp ins or del
- if ((rg = (char*)bam_aux_get(b, "RG")) == 0) continue; // no RG tag
- seq = bam1_seq(b);
- for (i = x = 0; i < b->core.n_cigar; ++i) {
- int op = bam_cigar_op(cigar[i]);
- if (bam_cigar_oplen(cigar[i]) == 1 && (op == BAM_CDEL || op == BAM_CINS)) {
- int c, j, hrun, which;
- c = bam1_seqi(seq, x);
- for (j = x + 1, hrun = 0; j < b->core.l_qseq; ++j, ++hrun) // calculate the hompolymer run length
- if (bam1_seqi(seq, j) != c) break;
- k = kh_get(rgcnt, h, rg + 1);
- if (k == kh_end(h)) { // absent
- char *key = strdup(rg + 1);
- k = kh_put(rgcnt, h, key, &c);
- memset(&kh_val(h, k), 0, sizeof(rgcnt_t));
- }
- which = (hrun <= L_THRES? 0 : 1)<<1 | (op == BAM_CINS? 0 : 1);
- ++kh_val(h, k).cnt[which];
- }
- if (bam_cigar_type(op)&1) ++x;
- }
- }
- }
-
- for (k = 0; k != kh_end(h); ++k) {
- if (!kh_exist(h, k)) continue;
- printf("%s\t%s", argv[1], kh_key(h, k));
- for (i = 0; i < 4; ++i)
- printf("\t%ld", kh_val(h, k).cnt[i]);
- putchar('\n');
- free((char*)kh_key(h, k));
- }
-
- bam_destroy1(b);
- bam_close(fp);
- kh_destroy(rgcnt, h);
- return 0;
-}
diff --git a/sam/examples/ex1.fa b/sam/examples/ex1.fa
deleted file mode 100644
index ef611b4..0000000
--- a/sam/examples/ex1.fa
+++ /dev/null
@@ -1,56 +0,0 @@
->seq1
-CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCCATGGCCCAGCATTAGGGAGCT
-GTGGACCCTGCAGCCTGGCTGTGGGGGCCGCAGTGGCTGAGGGGTGCAGAGCCGAGTCAC
-GGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAG
-TCCCATTTGGAGCCCCTCTAAGCCGTTCTATTTGTAATGAAAACTATATTTATGCTATTC
-AGTTCTAAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAA
-CAACCTTGAGAACCCCAGGGAATTTGTCAATGTCAGGGAAGGAGCATTTTGTCAGTTACC
-AAATGTGTTTATTACCAGAGGGATGGAGGGAAGAGGGACGCTGAAGAACTTTGATGCCCT
-CTTCTTCCAAAGATGAAACGCGTAACTGCGCTCTCATTCACTCCAGCTCCCTGTCACCCA
-ATGGACCTGTGATATCTGGATTCTGGGAAATTCTTCATCCTGGACCCTGAGAGATTCTGC
-AGCCCAGCTCCAGATTGCTTGTGGTCTGACAGGCTGCAACTGTGAGCCATCACAATGAAC
-AACAGGAAGAAAAGGTCTTTCAAAAGGTGATGTGTGTTCTCATCAACCTCATACACACAC
-ATGGTTTAGGGGTATAATACCTCTACATGGCTGATTATGAAAACAATGTTCCCCAGATAC
-CATCCCTGTCTTACTTCCAGCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCT
-TTTGGCATTTGCCTTCAGACCCTACACGAATGCGTCTCTACCACAGGGGGCTGCGCGGTT
-TCCCATCATGAAGCACTGAACTTCCACGTCTCATCTAGGGGAACAGGGAGGTGCACTAAT
-GCGCTCCACGCCCAAGCCCTTCTCACAGTTTCTGCCCCCAGCATGGTTGTACTGGGCAAT
-ACATGAGATTATTAGGAAATGCTTTACTGTCATAACTATGAAGAGACTATTGCCAGATGA
-ACCACACATTAATACTATGTTTCTTATCTGCACATTACTACCCTGCAATTAATATAATTG
-TGTCCATGTACACACGCTGTCCTATGTACTTATCATGACTCTATCCCAAATTCCCAATTA
-CGTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGTATCAATTTGGTGTTCTGTGTAAAG
-TCTCAGGGAGCCGTCCGTGTCCTCCCATCTGGCCTCGTCCACACTGGTTCTCTTGAAAGC
-TTGGGCTGTAATGATGCCCCTTGGCCATCACCCAGTCCCTGCCCCATCTCTTGTAATCTC
-TCTCCTTTTTGCTGCATCCCTGTCTTCCTCTGTCTTGATTTACTTGTTGTTGGTTTTCTG
-TTTCTTTGTTTGATTTGGTGGAAGACATAATCCCACGCTTCCTATGGAAAGGTTGTTGGG
-AGATTTTTAATGATTCCTCAATGTTAAAATGTCTATTTTTGTCTTGACACCCAACTAATA
-TTTGTCTGAGCAAAACAGTCTAGATGAGAGAGAACTTCCCTGGAGGTCTGATGGCGTTTC
-TCCCTCGTCTTCTTA
->seq2
-TTCAAATGAACTTCTGTAATTGAAAAATTCATTTAAGAAATTACAAAATATAGTTGAAAG
-CTCTAACAATAGACTAAACCAAGCAGAAGAAAGAGGTTCAGAACTTGAAGACAAGTCTCT
-TATGAATTAACCCAGTCAGACAAAAATAAAGAAAAAAATTTTAAAAATGAACAGAGCTTT
-CAAGAAGTATGAGATTATGTAAAGTAACTGAACCTATGAGTCACAGGTATTCCTGAGGAA
-AAAGAAAAAGTGAGAAGTTTGGAAAAACTATTTGAGGAAGTAATTGGGGAAAACCTCTTT
-AGTCTTGCTAGAGATTTAGACATCTAAATGAAAGAGGCTCAAAGAATGCCAGGAAGATAC
-ATTGCAAGACAGACTTCATCAAGATATGTAGTCATCAGACTATCTAAAGTCAACATGAAG
-GAAAAAAATTCTAAAATCAGCAAGAGAAAAGCATACAGTCATCTATAAAGGAAATCCCAT
-CAGAATAACAATGGGCTTCTCAGCAGAAACCTTACAAGCCAGAAGAGATTGGATCTAATT
-TTTGGACTTCTTAAAGAAAAAAAAACCTGTCAAACACGAATGTTATGCCCTGCTAAACTA
-AGCATCATAAATGAAGGGGAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATA
-ATTCATCATCACTAAACCAGTCCTATAAGAAATGCTCAAAAGAATTGTAAAAGTCAAAAT
-TAAAGTTCAATACTCACCATCATAAATACACACAAAAGTACAAAACTCACAGGTTTTATA
-AAACAATTGAGACTACAGAGCAACTAGGTAAAAAATTAACATTACAACAGGAACAAAACC
-TCATATATCAATATTAACTTTGAATAAAAAGGGATTAAATTCCCCCACTTAAGAGATATA
-GATTGGCAGAACAGATTTAAAAACATGAACTAACTATATGCTGTTTACAAGAAACTCATT
-AATAAAGACATGAGTTCAGGTAAAGGGGTGGAAAAAGATGTTCTACGCAAACAGAAACCA
-AATGAGAGAAGGAGTAGCTATACTTATATCAGATAAAGCACACTTTAAATCAACAACAGT
-AAAATAAAACAAAGGAGGTCATCATACAATGATAAAAAGATCAATTCAGCAAGAAGATAT
-AACCATCCTACTAAATACATATGCACCTAACACAAGACTACCCAGATTCATAAAACAAAT
-ACTACTAGACCTAAGAGGGATGAGAAATTACCTAATTGGTACAATGTACAATATTCTGAT
-GATGGTTACACTAAAAGCCCATACTTTACTGCTACTCAATATATCCATGTAACAAATCTG
-CGCTTGTACTTCTAAATCTATAAAAAAATTAAAATTTAACAAAAGTAAATAAAACACATA
-GCTAAAACTAAAAAAGCAAAAACAAAAACTATGCTAAGTATTGGTAAAGATGTGGGGAAA
-AAAGTAAACTCTCAAATATTGCTAGTGGGAGTATAAATTGTTTTCCACTTTGGAAAACAA
-TTTGGTAATTTCGTTTTTTTTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTTGCATGC
-CAGAAAAAAATATTTACAGTAACT
diff --git a/sam/examples/ex1.sam.gz b/sam/examples/ex1.sam.gz
deleted file mode 100644
index 44c07ee..0000000
Binary files a/sam/examples/ex1.sam.gz and /dev/null differ
diff --git a/sam/examples/toy.fa b/sam/examples/toy.fa
deleted file mode 100644
index afe990a..0000000
--- a/sam/examples/toy.fa
+++ /dev/null
@@ -1,4 +0,0 @@
->ref
-AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT
->ref2
-aggttttataaaacaattaagtctacagagcaactacgcg
diff --git a/sam/examples/toy.sam b/sam/examples/toy.sam
deleted file mode 100644
index 33449b1..0000000
--- a/sam/examples/toy.sam
+++ /dev/null
@@ -1,14 +0,0 @@
-@SQ SN:ref LN:45
-@SQ SN:ref2 LN:40
-r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112
-r002 0 ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA *
-r003 0 ref 9 30 5H6M * 0 0 AGCTAA *
-r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC *
-r003 16 ref 29 30 6H5M * 0 0 TAGGC *
-r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT *
-x1 0 ref2 1 30 20M * 0 0 aggttttataaaacaaataa ????????????????????
-x2 0 ref2 2 30 21M * 0 0 ggttttataaaacaaataatt ?????????????????????
-x3 0 ref2 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ??????????????????????????
-x4 0 ref2 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ?????????????????????????
-x5 0 ref2 12 30 24M * 0 0 aaTaattaagtctacagagcaact ????????????????????????
-x6 0 ref2 14 30 23M * 0 0 Taattaagtctacagagcaacta ???????????????????????
diff --git a/sam/faidx.c b/sam/faidx.c
deleted file mode 100644
index 51c82ac..0000000
--- a/sam/faidx.c
+++ /dev/null
@@ -1,437 +0,0 @@
-#include <ctype.h>
-#include <string.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include "faidx.h"
-#include "khash.h"
-
-typedef struct {
- int32_t line_len, line_blen;
- int64_t len;
- uint64_t offset;
-} faidx1_t;
-KHASH_MAP_INIT_STR(s, faidx1_t)
-
-#ifndef _NO_RAZF
-#include "razf.h"
-#else
-#ifdef _WIN32
-#define ftello(fp) ftell(fp)
-#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
-#else
-extern off_t ftello(FILE *stream);
-extern int fseeko(FILE *stream, off_t offset, int whence);
-#endif
-#define RAZF FILE
-#define razf_read(fp, buf, size) fread(buf, 1, size, fp)
-#define razf_open(fn, mode) fopen(fn, mode)
-#define razf_close(fp) fclose(fp)
-#define razf_seek(fp, offset, whence) fseeko(fp, offset, whence)
-#define razf_tell(fp) ftello(fp)
-#endif
-#ifdef _USE_KNETFILE
-#include "knetfile.h"
-#endif
-
-struct __faidx_t {
- RAZF *rz;
- int n, m;
- char **name;
- khash_t(s) *hash;
-};
-
-#ifndef kroundup32
-#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
-#endif
-
-static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset)
-{
- khint_t k;
- int ret;
- faidx1_t t;
- if (idx->n == idx->m) {
- idx->m = idx->m? idx->m<<1 : 16;
- idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m);
- }
- idx->name[idx->n] = strdup(name);
- k = kh_put(s, idx->hash, idx->name[idx->n], &ret);
- t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset;
- kh_value(idx->hash, k) = t;
- ++idx->n;
-}
-
-faidx_t *fai_build_core(RAZF *rz)
-{
- char c, *name;
- int l_name, m_name, ret;
- int line_len, line_blen, state;
- int l1, l2;
- faidx_t *idx;
- uint64_t offset;
- int64_t len;
-
- idx = (faidx_t*)calloc(1, sizeof(faidx_t));
- idx->hash = kh_init(s);
- name = 0; l_name = m_name = 0;
- len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
- while (razf_read(rz, &c, 1)) {
- if (c == '\n') { // an empty line
- if (state == 1) {
- offset = razf_tell(rz);
- continue;
- } else if ((state == 0 && len < 0) || state == 2) continue;
- }
- if (c == '>') { // fasta header
- if (len >= 0)
- fai_insert_index(idx, name, len, line_len, line_blen, offset);
- l_name = 0;
- while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) {
- if (m_name < l_name + 2) {
- m_name = l_name + 2;
- kroundup32(m_name);
- name = (char*)realloc(name, m_name);
- }
- name[l_name++] = c;
- }
- name[l_name] = '\0';
- if (ret == 0) {
- fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
- free(name); fai_destroy(idx);
- return 0;
- }
- if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n');
- state = 1; len = 0;
- offset = razf_tell(rz);
- } else {
- if (state == 3) {
- fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
- free(name); fai_destroy(idx);
- return 0;
- }
- if (state == 2) state = 3;
- l1 = l2 = 0;
- do {
- ++l1;
- if (isgraph(c)) ++l2;
- } while ((ret = razf_read(rz, &c, 1)) && c != '\n');
- if (state == 3 && l2) {
- fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
- free(name); fai_destroy(idx);
- return 0;
- }
- ++l1; len += l2;
- if (state == 1) line_len = l1, line_blen = l2, state = 0;
- else if (state == 0) {
- if (l1 != line_len || l2 != line_blen) state = 2;
- }
- }
- }
- fai_insert_index(idx, name, len, line_len, line_blen, offset);
- free(name);
- return idx;
-}
-
-void fai_save(const faidx_t *fai, FILE *fp)
-{
- khint_t k;
- int i;
- for (i = 0; i < fai->n; ++i) {
- faidx1_t x;
- k = kh_get(s, fai->hash, fai->name[i]);
- x = kh_value(fai->hash, k);
-#ifdef _WIN32
- fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len);
-#else
- fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len);
-#endif
- }
-}
-
-faidx_t *fai_read(FILE *fp)
-{
- faidx_t *fai;
- char *buf, *p;
- int len, line_len, line_blen;
-#ifdef _WIN32
- long offset;
-#else
- long long offset;
-#endif
- fai = (faidx_t*)calloc(1, sizeof(faidx_t));
- fai->hash = kh_init(s);
- buf = (char*)calloc(0x10000, 1);
- while (!feof(fp) && fgets(buf, 0x10000, fp)) {
- for (p = buf; *p && isgraph(*p); ++p);
- *p = 0; ++p;
-#ifdef _WIN32
- sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len);
-#else
- sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len);
-#endif
- fai_insert_index(fai, buf, len, line_len, line_blen, offset);
- }
- free(buf);
- return fai;
-}
-
-void fai_destroy(faidx_t *fai)
-{
- int i;
- for (i = 0; i < fai->n; ++i) free(fai->name[i]);
- free(fai->name);
- kh_destroy(s, fai->hash);
- if (fai->rz) razf_close(fai->rz);
- free(fai);
-}
-
-int fai_build(const char *fn)
-{
- char *str;
- RAZF *rz;
- FILE *fp;
- faidx_t *fai;
- str = (char*)calloc(strlen(fn) + 5, 1);
- sprintf(str, "%s.fai", fn);
- rz = razf_open(fn, "r");
- if (rz == 0) {
- fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
- free(str);
- return -1;
- }
- fai = fai_build_core(rz);
- razf_close(rz);
- fp = fopen(str, "wb");
- if (fp == 0) {
- fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
- fai_destroy(fai); free(str);
- return -1;
- }
- fai_save(fai, fp);
- fclose(fp);
- free(str);
- fai_destroy(fai);
- return 0;
-}
-
-#ifdef _USE_KNETFILE
-FILE *download_and_open(const char *fn)
-{
- const int buf_size = 1 * 1024 * 1024;
- uint8_t *buf;
- FILE *fp;
- knetFile *fp_remote;
- const char *url = fn;
- const char *p;
- int l = strlen(fn);
- for (p = fn + l - 1; p >= fn; --p)
- if (*p == '/') break;
- fn = p + 1;
-
- // First try to open a local copy
- fp = fopen(fn, "r");
- if (fp)
- return fp;
-
- // If failed, download from remote and open
- fp_remote = knet_open(url, "rb");
- if (fp_remote == 0) {
- fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url);
- return NULL;
- }
- if ((fp = fopen(fn, "wb")) == 0) {
- fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn);
- knet_close(fp_remote);
- return NULL;
- }
- buf = (uint8_t*)calloc(buf_size, 1);
- while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
- fwrite(buf, 1, l, fp);
- free(buf);
- fclose(fp);
- knet_close(fp_remote);
-
- return fopen(fn, "r");
-}
-#endif
-
-faidx_t *fai_load(const char *fn)
-{
- char *str;
- FILE *fp;
- faidx_t *fai;
- str = (char*)calloc(strlen(fn) + 5, 1);
- sprintf(str, "%s.fai", fn);
-
-#ifdef _USE_KNETFILE
- if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)
- {
- fp = download_and_open(str);
- if ( !fp )
- {
- fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str);
- free(str);
- return 0;
- }
- }
- else
-#endif
- fp = fopen(str, "rb");
- if (fp == 0) {
- fprintf(stderr, "[fai_load] build FASTA index.\n");
- fai_build(fn);
- fp = fopen(str, "rb");
- if (fp == 0) {
- fprintf(stderr, "[fai_load] fail to open FASTA index.\n");
- free(str);
- return 0;
- }
- }
-
- fai = fai_read(fp);
- fclose(fp);
-
- fai->rz = razf_open(fn, "rb");
- free(str);
- if (fai->rz == 0) {
- fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
- return 0;
- }
- return fai;
-}
-
-char *fai_fetch(const faidx_t *fai, const char *str, int *len)
-{
- char *s, c;
- int i, l, k, name_end;
- khiter_t iter;
- faidx1_t val;
- khash_t(s) *h;
- int beg, end;
-
- beg = end = -1;
- h = fai->hash;
- name_end = l = strlen(str);
- s = (char*)malloc(l+1);
- // remove space
- for (i = k = 0; i < l; ++i)
- if (!isspace(str[i])) s[k++] = str[i];
- s[k] = 0; l = k;
- // determine the sequence name
- for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
- if (i >= 0) name_end = i;
- if (name_end < l) { // check if this is really the end
- int n_hyphen = 0;
- for (i = name_end + 1; i < l; ++i) {
- if (s[i] == '-') ++n_hyphen;
- else if (!isdigit(s[i]) && s[i] != ',') break;
- }
- if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
- s[name_end] = 0;
- iter = kh_get(s, h, s);
- if (iter == kh_end(h)) { // cannot find the sequence name
- iter = kh_get(s, h, str); // try str as the name
- if (iter == kh_end(h)) {
- *len = 0;
- free(s); return 0;
- } else s[name_end] = ':', name_end = l;
- }
- } else iter = kh_get(s, h, str);
- if(iter == kh_end(h)) {
- fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str);
- free(s);
- return 0;
- };
- val = kh_value(h, iter);
- // parse the interval
- if (name_end < l) {
- for (i = k = name_end + 1; i < l; ++i)
- if (s[i] != ',') s[k++] = s[i];
- s[k] = 0;
- beg = atoi(s + name_end + 1);
- for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
- end = i < k? atoi(s + i + 1) : val.len;
- if (beg > 0) --beg;
- } else beg = 0, end = val.len;
- if (beg >= val.len) beg = val.len;
- if (end >= val.len) end = val.len;
- if (beg > end) beg = end;
- free(s);
-
- // now retrieve the sequence
- l = 0;
- s = (char*)malloc(end - beg + 2);
- razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
- while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err)
- if (isgraph(c)) s[l++] = c;
- s[l] = '\0';
- *len = l;
- return s;
-}
-
-int faidx_main(int argc, char *argv[])
-{
- if (argc == 1) {
- fprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\n");
- return 1;
- } else {
- if (argc == 2) fai_build(argv[1]);
- else {
- int i, j, k, l;
- char *s;
- faidx_t *fai;
- fai = fai_load(argv[1]);
- if (fai == 0) return 1;
- for (i = 2; i != argc; ++i) {
- printf(">%s\n", argv[i]);
- s = fai_fetch(fai, argv[i], &l);
- for (j = 0; j < l; j += 60) {
- for (k = 0; k < 60 && k < l - j; ++k)
- putchar(s[j + k]);
- putchar('\n');
- }
- free(s);
- }
- fai_destroy(fai);
- }
- }
- return 0;
-}
-
-int faidx_fetch_nseq(const faidx_t *fai)
-{
- return fai->n;
-}
-
-char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len)
-{
- int l;
- char c;
- khiter_t iter;
- faidx1_t val;
- char *seq=NULL;
-
- // Adjust position
- iter = kh_get(s, fai->hash, c_name);
- if(iter == kh_end(fai->hash)) return 0;
- val = kh_value(fai->hash, iter);
- if(p_end_i < p_beg_i) p_beg_i = p_end_i;
- if(p_beg_i < 0) p_beg_i = 0;
- else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
- if(p_end_i < 0) p_end_i = 0;
- else if(val.len <= p_end_i) p_end_i = val.len - 1;
-
- // Now retrieve the sequence
- l = 0;
- seq = (char*)malloc(p_end_i - p_beg_i + 2);
- razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
- while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1)
- if (isgraph(c)) seq[l++] = c;
- seq[l] = '\0';
- *len = l;
- return seq;
-}
-
-#ifdef FAIDX_MAIN
-int main(int argc, char *argv[]) { return faidx_main(argc, argv); }
-#endif
diff --git a/sam/faidx.h b/sam/faidx.h
deleted file mode 100644
index 1fb1b1f..0000000
--- a/sam/faidx.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008 Genome Research Ltd (GRL).
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/* Contact: Heng Li <***@sanger.ac.uk> */
-
-#ifndef FAIDX_H
-#define FAIDX_H
-
-/*!
- @header
-
- Index FASTA files and extract subsequence.
-
- @copyright The Wellcome Trust Sanger Institute.
- */
-
-struct __faidx_t;
-typedef struct __faidx_t faidx_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- /*!
- @abstract Build index for a FASTA or razip compressed FASTA file.
- @param fn FASTA file name
- @return 0 on success; or -1 on failure
- @discussion File "fn.fai" will be generated.
- */
- int fai_build(const char *fn);
-
- /*!
- @abstract Distroy a faidx_t struct.
- @param fai Pointer to the struct to be destroyed
- */
- void fai_destroy(faidx_t *fai);
-
- /*!
- @abstract Load index from "fn.fai".
- @param fn File name of the FASTA file
- */
- faidx_t *fai_load(const char *fn);
-
- /*!
- @abstract Fetch the sequence in a region.
- @param fai Pointer to the faidx_t struct
- @param reg Region in the format "chr2:20,000-30,000"
- @param len Length of the region
- @return Pointer to the sequence; null on failure
-
- @discussion The returned sequence is allocated by malloc family
- and should be destroyed by end users by calling free() on it.
- */
- char *fai_fetch(const faidx_t *fai, const char *reg, int *len);
-
- /*!
- @abstract Fetch the number of sequences.
- @param fai Pointer to the faidx_t struct
- @return The number of sequences
- */
- int faidx_fetch_nseq(const faidx_t *fai);
-
- /*!
- @abstract Fetch the sequence in a region.
- @param fai Pointer to the faidx_t struct
- @param c_name Region name
- @param p_beg_i Beginning position number (zero-based)
- @param p_end_i End position number (zero-based)
- @param len Length of the region
- @return Pointer to the sequence; null on failure
-
- @discussion The returned sequence is allocated by malloc family
- and should be destroyed by end users by calling free() on it.
- */
- char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/sam/kaln.c b/sam/kaln.c
deleted file mode 100644
index 9c0bbaa..0000000
--- a/sam/kaln.c
+++ /dev/null
@@ -1,486 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2003-2006, 2008, 2009, by Heng Li <***@gmail.com>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-#include <math.h>
-#include "kaln.h"
-
-#define FROM_M 0
-#define FROM_I 1
-#define FROM_D 2
-
-typedef struct {
- int i, j;
- unsigned char ctype;
-} path_t;
-
-int aln_sm_blosum62[] = {
-/* A R N D C Q E G H I L K M F P S T W Y V * X */
- 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
- -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
- -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
- -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
- 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
- -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
- -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
- 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
- -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
- -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
- -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
- -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
- -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
- -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
- -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
- 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
- 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
- -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
- -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
- 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
- -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
- 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
-};
-
-int aln_sm_blast[] = {
- 1, -3, -3, -3, -2,
- -3, 1, -3, -3, -2,
- -3, -3, 1, -3, -2,
- -3, -3, -3, 1, -2,
- -2, -2, -2, -2, -2
-};
-
-int aln_sm_qual[] = {
- 0, -23, -23, -23, 0,
- -23, 0, -23, -23, 0,
- -23, -23, 0, -23, 0,
- -23, -23, -23, 0, 0,
- 0, 0, 0, 0, 0
-};
-
-ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 };
-ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 };
-
-ka_param2_t ka_param2_qual = { 37, 11, 37, 11, 37, 11, 0, 0, aln_sm_qual, 5, 50 };
-
-static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
-{
- int i, n;
- uint32_t *cigar;
- unsigned char last_type;
-
- if (path_len == 0 || path == 0) {
- *n_cigar = 0;
- return 0;
- }
-
- last_type = path->ctype;
- for (i = n = 1; i < path_len; ++i) {
- if (last_type != path[i].ctype) ++n;
- last_type = path[i].ctype;
- }
- *n_cigar = n;
- cigar = (uint32_t*)calloc(*n_cigar, 4);
-
- cigar[0] = 1u << 4 | path[path_len-1].ctype;
- last_type = path[path_len-1].ctype;
- for (i = path_len - 2, n = 0; i >= 0; --i) {
- if (path[i].ctype == last_type) cigar[n] += 1u << 4;
- else {
- cigar[++n] = 1u << 4 | path[i].ctype;
- last_type = path[i].ctype;
- }
- }
-
- return cigar;
-}
-
-/***************************/
-/* START OF common_align.c */
-/***************************/
-
-#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
-
-#define set_M(MM, cur, p, sc) \
-{ \
- if ((p)->M >= (p)->I) { \
- if ((p)->M >= (p)->D) { \
- (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
- } else { \
- (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
- } \
- } else { \
- if ((p)->I > (p)->D) { \
- (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
- } else { \
- (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
- } \
- } \
-}
-#define set_I(II, cur, p) \
-{ \
- if ((p)->M - gap_open > (p)->I) { \
- (cur)->It = FROM_M; \
- (II) = (p)->M - gap_open - gap_ext; \
- } else { \
- (cur)->It = FROM_I; \
- (II) = (p)->I - gap_ext; \
- } \
-}
-#define set_end_I(II, cur, p) \
-{ \
- if (gap_end_ext >= 0) { \
- if ((p)->M - gap_end_open > (p)->I) { \
- (cur)->It = FROM_M; \
- (II) = (p)->M - gap_end_open - gap_end_ext; \
- } else { \
- (cur)->It = FROM_I; \
- (II) = (p)->I - gap_end_ext; \
- } \
- } else set_I(II, cur, p); \
-}
-#define set_D(DD, cur, p) \
-{ \
- if ((p)->M - gap_open > (p)->D) { \
- (cur)->Dt = FROM_M; \
- (DD) = (p)->M - gap_open - gap_ext; \
- } else { \
- (cur)->Dt = FROM_D; \
- (DD) = (p)->D - gap_ext; \
- } \
-}
-#define set_end_D(DD, cur, p) \
-{ \
- if (gap_end_ext >= 0) { \
- if ((p)->M - gap_end_open > (p)->D) { \
- (cur)->Dt = FROM_M; \
- (DD) = (p)->M - gap_end_open - gap_end_ext; \
- } else { \
- (cur)->Dt = FROM_D; \
- (DD) = (p)->D - gap_end_ext; \
- } \
- } else set_D(DD, cur, p); \
-}
-
-typedef struct {
- uint8_t Mt:3, It:2, Dt:3;
-} dpcell_t;
-
-typedef struct {
- int M, I, D;
-} dpscore_t;
-
-/***************************
- * banded global alignment *
- ***************************/
-uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
-{
- int i, j;
- dpcell_t **dpcell, *q;
- dpscore_t *curr, *last, *s;
- int b1, b2, tmp_end;
- int *mat, end, max = 0;
- uint8_t type, ctype;
- uint32_t *cigar = 0;
-
- int gap_open, gap_ext, gap_end_open, gap_end_ext, b;
- int *score_matrix, N_MATRIX_ROW;
-
- /* initialize some align-related parameters. just for compatibility */
- gap_open = ap->gap_open;
- gap_ext = ap->gap_ext;
- gap_end_open = ap->gap_end_open;
- gap_end_ext = ap->gap_end_ext;
- b = ap->band_width;
- score_matrix = ap->matrix;
- N_MATRIX_ROW = ap->row;
-
- if (n_cigar) *n_cigar = 0;
- if (len1 == 0 || len2 == 0) return 0;
-
- /* calculate b1 and b2 */
- if (len1 > len2) {
- b1 = len1 - len2 + b;
- b2 = b;
- } else {
- b1 = b;
- b2 = len2 - len1 + b;
- }
- if (b1 > len1) b1 = len1;
- if (b2 > len2) b2 = len2;
- --seq1; --seq2;
-
- /* allocate memory */
- end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
- dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
- for (j = 0; j <= len2; ++j)
- dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
- for (j = b2 + 1; j <= len2; ++j)
- dpcell[j] -= j - b2;
- curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
- last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
-
- /* set first row */
- SET_INF(*curr); curr->M = 0;
- for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
- SET_INF(*s);
- set_end_D(s->D, dpcell[0] + i, s - 1);
- }
- s = curr; curr = last; last = s;
-
- /* core dynamic programming, part 1 */
- tmp_end = (b2 < len2)? b2 : len2 - 1;
- for (j = 1; j <= tmp_end; ++j) {
- q = dpcell[j]; s = curr; SET_INF(*s);
- set_end_I(s->I, q, last);
- end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
- mat = score_matrix + seq2[j] * N_MATRIX_ROW;
- ++s; ++q;
- for (i = 1; i != end; ++i, ++s, ++q) {
- set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
- set_I(s->I, q, last + i);
- set_D(s->D, q, s - 1);
- }
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_D(s->D, q, s - 1);
- if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
- set_end_I(s->I, q, last + i);
- } else s->I = MINOR_INF;
- s = curr; curr = last; last = s;
- }
- /* last row for part 1, use set_end_D() instead of set_D() */
- if (j == len2 && b2 != len2 - 1) {
- q = dpcell[j]; s = curr; SET_INF(*s);
- set_end_I(s->I, q, last);
- end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
- mat = score_matrix + seq2[j] * N_MATRIX_ROW;
- ++s; ++q;
- for (i = 1; i != end; ++i, ++s, ++q) {
- set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
- set_I(s->I, q, last + i);
- set_end_D(s->D, q, s - 1);
- }
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_end_D(s->D, q, s - 1);
- if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
- set_end_I(s->I, q, last + i);
- } else s->I = MINOR_INF;
- s = curr; curr = last; last = s;
- ++j;
- }
-
- /* core dynamic programming, part 2 */
- for (; j <= len2 - b2 + 1; ++j) {
- SET_INF(curr[j - b2]);
- mat = score_matrix + seq2[j] * N_MATRIX_ROW;
- end = j + b1 - 1;
- for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_I(s->I, q, last + i);
- set_D(s->D, q, s - 1);
- }
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_D(s->D, q, s - 1);
- s->I = MINOR_INF;
- s = curr; curr = last; last = s;
- }
-
- /* core dynamic programming, part 3 */
- for (; j < len2; ++j) {
- SET_INF(curr[j - b2]);
- mat = score_matrix + seq2[j] * N_MATRIX_ROW;
- for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_I(s->I, q, last + i);
- set_D(s->D, q, s - 1);
- }
- set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
- set_end_I(s->I, q, last + i);
- set_D(s->D, q, s - 1);
- s = curr; curr = last; last = s;
- }
- /* last row */
- if (j == len2) {
- SET_INF(curr[j - b2]);
- mat = score_matrix + seq2[j] * N_MATRIX_ROW;
- for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
- set_M(s->M, q, last + i - 1, mat[seq1[i]]);
- set_I(s->I, q, last + i);
- set_end_D(s->D, q, s - 1);
- }
- set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
- set_end_I(s->I, q, last + i);
- set_end_D(s->D, q, s - 1);
- s = curr; curr = last; last = s;
- }
-
- *_score = last[len1].M;
- if (n_cigar) { /* backtrace */
- path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
- i = len1; j = len2;
- q = dpcell[j] + i;
- s = last + len1;
- max = s->M; type = q->Mt; ctype = FROM_M;
- if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
- if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
-
- p = path;
- p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
- ++p;
- do {
- switch (ctype) {
- case FROM_M: --i; --j; break;
- case FROM_I: --j; break;
- case FROM_D: --i; break;
- }
- q = dpcell[j] + i;
- ctype = type;
- switch (type) {
- case FROM_M: type = q->Mt; break;
- case FROM_I: type = q->It; break;
- case FROM_D: type = q->Dt; break;
- }
- p->ctype = ctype; p->i = i; p->j = j;
- ++p;
- } while (i || j);
- cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
- free(path);
- }
-
- /* free memory */
- for (j = b2 + 1; j <= len2; ++j)
- dpcell[j] += j - b2;
- for (j = 0; j <= len2; ++j)
- free(dpcell[j]);
- free(dpcell);
- free(curr); free(last);
-
- return cigar;
-}
-
-typedef struct {
- int M, I, D;
-} score_aux_t;
-
-#define MINUS_INF -0x40000000
-
-// matrix: len2 rows and len1 columns
-int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap)
-{
-
-#define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) { \
- int t1, t2; \
- score_aux_t *_q; \
- _q = _q0; \
- _p->M = _q->M >= _q->I? _q->M : _q->I; \
- _p->M = _p->M >= _q->D? _p->M : _q->D; \
- _p->M += (_sc); \
- ++_q; t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; \
- _q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; \
- }
-
- int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret;
- const uint8_t *seq1, *seq2;
- score_aux_t *curr, *last, *swap;
- bw = abs(len1 - len2) + ap->band_width;
- i = len1 > len2? len1 : len2;
- if (bw > i + 1) bw = i + 1;
- seq1 = _seq1 - 1; seq2 = _seq2 - 1;
- curr = calloc(len1 + 2, sizeof(score_aux_t));
- last = calloc(len1 + 2, sizeof(score_aux_t));
- { // the zero-th row
- int x, end = len1;
- score_aux_t *p;
- j = 0;
- x = j + bw; end = len1 < x? len1 : x; // band end
- p = curr;
- p->M = 0; p->I = p->D = MINUS_INF;
- for (i = 1, p = &curr[1]; i <= end; ++i, ++p)
- p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i);
- p->M = p->I = p->D = MINUS_INF;
- swap = curr; curr = last; last = swap;
- }
- for (j = 1; j < len2; ++j) {
- int x, beg = 0, end = len1, *scrow, col_end;
- score_aux_t *p;
- x = j - bw; beg = 0 > x? 0 : x; // band start
- x = j + bw; end = len1 < x? len1 : x; // band end
- if (beg == 0) { // from zero-th column
- p = curr;
- p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
- ++beg; // then beg = 1
- }
- scrow = scmat + seq2[j] * scmat_size;
- if (end == len1) col_end = 1, --end;
- else col_end = 0;
- for (i = beg, p = &curr[beg]; i <= end; ++i, ++p)
- __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide);
- if (col_end) {
- __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide);
- ++p;
- }
- p->M = p->I = p->D = MINUS_INF;
-// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
- swap = curr; curr = last; last = swap;
- }
- { // the last row
- int x, beg = 0, *scrow;
- score_aux_t *p;
- j = len2;
- x = j - bw; beg = 0 > x? 0 : x; // band start
- if (beg == 0) { // from zero-th column
- p = curr;
- p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
- ++beg; // then beg = 1
- }
- scrow = scmat + seq2[j] * scmat_size;
- for (i = beg, p = &curr[beg]; i < len1; ++i, ++p)
- __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede);
- __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede);
-// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
- }
- ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I;
- ret = ret >= curr[len1].D? ret : curr[len1].D;
- free(curr); free(last);
- return ret;
-}
-
-#ifdef _MAIN
-int main(int argc, char *argv[])
-{
-// int len1 = 35, len2 = 35;
-// uint8_t *seq1 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1";
-// uint8_t *seq2 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0";
- int len1 = 4, len2 = 4;
- uint8_t *seq1 = (uint8_t*)"\1\0\0\1";
- uint8_t *seq2 = (uint8_t*)"\1\0\1\0";
- int sc;
-// ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0);
- sc = ka_global_score(seq1, len1, seq2, len2, &ka_param2_qual);
- printf("%d\n", sc);
- return 0;
-}
-#endif
diff --git a/sam/kaln.h b/sam/kaln.h
deleted file mode 100644
index 1ece132..0000000
--- a/sam/kaln.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2003-2006, 2008, 2009 by Heng Li <***@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-#ifndef LH3_KALN_H_
-#define LH3_KALN_H_
-
-#include <stdint.h>
-
-#define MINOR_INF -1073741823
-
-typedef struct {
- int gap_open;
- int gap_ext;
- int gap_end_open;
- int gap_end_ext;
-
- int *matrix;
- int row;
- int band_width;
-} ka_param_t;
-
-typedef struct {
- int iio, iie, ido, ide;
- int eio, eie, edo, ede;
- int *matrix;
- int row;
- int band_width;
-} ka_param2_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap,
- int *_score, int *n_cigar);
- int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap);
-#ifdef __cplusplus
-}
-#endif
-
-extern ka_param_t ka_param_blast; /* = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; */
-extern ka_param_t ka_param_qual; // only use this for global alignment!!!
-extern ka_param2_t ka_param2_qual; // only use this for global alignment!!!
-
-#endif
diff --git a/sam/khash.h b/sam/khash.h
deleted file mode 100644
index a7e8056..0000000
--- a/sam/khash.h
+++ /dev/null
@@ -1,528 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008, 2009, 2011 by Attractive Chaos <***@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/*
- An example:
-
-#include "khash.h"
-KHASH_MAP_INIT_INT(32, char)
-int main() {
- int ret, is_missing;
- khiter_t k;
- khash_t(32) *h = kh_init(32);
- k = kh_put(32, h, 5, &ret);
- if (!ret) kh_del(32, h, k);
- kh_value(h, k) = 10;
- k = kh_get(32, h, 10);
- is_missing = (k == kh_end(h));
- k = kh_get(32, h, 5);
- kh_del(32, h, k);
- for (k = kh_begin(h); k != kh_end(h); ++k)
- if (kh_exist(h, k)) kh_value(h, k) = 1;
- kh_destroy(32, h);
- return 0;
-}
-*/
-
-/*
- 2011-02-14 (0.2.5):
-
- * Allow to declare global functions.
-
- 2009-09-26 (0.2.4):
-
- * Improve portability
-
- 2008-09-19 (0.2.3):
-
- * Corrected the example
- * Improved interfaces
-
- 2008-09-11 (0.2.2):
-
- * Improved speed a little in kh_put()
-
- 2008-09-10 (0.2.1):
-
- * Added kh_clear()
- * Fixed a compiling error
-
- 2008-09-02 (0.2.0):
-
- * Changed to token concatenation which increases flexibility.
-
- 2008-08-31 (0.1.2):
-
- * Fixed a bug in kh_get(), which has not been tested previously.
-
- 2008-08-31 (0.1.1):
-
- * Added destructor
-*/
-
-
-#ifndef __AC_KHASH_H
-#define __AC_KHASH_H
-
-/*!
- @header
-
- Generic hash table library.
-
- @copyright Heng Li
- */
-
-#define AC_VERSION_KHASH_H "0.2.5"
-
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-
-/* compiler-specific configuration */
-
-#if UINT_MAX == 0xffffffffu
-typedef unsigned int khint32_t;
-#elif ULONG_MAX == 0xffffffffu
-typedef unsigned long khint32_t;
-#endif
-
-#if ULONG_MAX == ULLONG_MAX
-typedef unsigned long khint64_t;
-#else
-typedef unsigned long long khint64_t;
-#endif
-
-#ifdef _MSC_VER
-#define inline __inline
-#endif
-
-typedef khint32_t khint_t;
-typedef khint_t khiter_t;
-
-#define __ac_HASH_PRIME_SIZE 32
-static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
-{
- 0ul, 3ul, 11ul, 23ul, 53ul,
- 97ul, 193ul, 389ul, 769ul, 1543ul,
- 3079ul, 6151ul, 12289ul, 24593ul, 49157ul,
- 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,
- 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,
- 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,
- 3221225473ul, 4294967291ul
-};
-
-#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
-#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
-#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
-#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
-#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
-#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
-#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
-
-static const double __ac_HASH_UPPER = 0.77;
-
-#define KHASH_DECLARE(name, khkey_t, khval_t) \
- typedef struct { \
- khint_t n_buckets, size, n_occupied, upper_bound; \
- khint32_t *flags; \
- khkey_t *keys; \
- khval_t *vals; \
- } kh_##name##_t; \
- extern kh_##name##_t *kh_init_##name(); \
- extern void kh_destroy_##name(kh_##name##_t *h); \
- extern void kh_clear_##name(kh_##name##_t *h); \
- extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
- extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
- extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
- extern void kh_del_##name(kh_##name##_t *h, khint_t x);
-
-#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
- typedef struct { \
- khint_t n_buckets, size, n_occupied, upper_bound; \
- khint32_t *flags; \
- khkey_t *keys; \
- khval_t *vals; \
- } kh_##name##_t; \
- SCOPE kh_##name##_t *kh_init_##name() { \
- return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \
- } \
- SCOPE void kh_destroy_##name(kh_##name##_t *h) \
- { \
- if (h) { \
- free(h->keys); free(h->flags); \
- free(h->vals); \
- free(h); \
- } \
- } \
- SCOPE void kh_clear_##name(kh_##name##_t *h) \
- { \
- if (h && h->flags) { \
- memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \
- h->size = h->n_occupied = 0; \
- } \
- } \
- SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
- { \
- if (h->n_buckets) { \
- khint_t inc, k, i, last; \
- k = __hash_func(key); i = k % h->n_buckets; \
- inc = 1 + k % (h->n_buckets - 1); last = i; \
- while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
- if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
- else i += inc; \
- if (i == last) return h->n_buckets; \
- } \
- return __ac_iseither(h->flags, i)? h->n_buckets : i; \
- } else return 0; \
- } \
- SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
- { \
- khint32_t *new_flags = 0; \
- khint_t j = 1; \
- { \
- khint_t t = __ac_HASH_PRIME_SIZE - 1; \
- while (__ac_prime_list[t] > new_n_buckets) --t; \
- new_n_buckets = __ac_prime_list[t+1]; \
- if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
- else { \
- new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
- memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
- if (h->n_buckets < new_n_buckets) { \
- h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
- if (kh_is_map) \
- h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
- } \
- } \
- } \
- if (j) { \
- for (j = 0; j != h->n_buckets; ++j) { \
- if (__ac_iseither(h->flags, j) == 0) { \
- khkey_t key = h->keys[j]; \
- khval_t val; \
- if (kh_is_map) val = h->vals[j]; \
- __ac_set_isdel_true(h->flags, j); \
- while (1) { \
- khint_t inc, k, i; \
- k = __hash_func(key); \
- i = k % new_n_buckets; \
- inc = 1 + k % (new_n_buckets - 1); \
- while (!__ac_isempty(new_flags, i)) { \
- if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
- else i += inc; \
- } \
- __ac_set_isempty_false(new_flags, i); \
- if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
- { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
- if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
- __ac_set_isdel_true(h->flags, i); \
- } else { \
- h->keys[i] = key; \
- if (kh_is_map) h->vals[i] = val; \
- break; \
- } \
- } \
- } \
- } \
- if (h->n_buckets > new_n_buckets) { \
- h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
- if (kh_is_map) \
- h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
- } \
- free(h->flags); \
- h->flags = new_flags; \
- h->n_buckets = new_n_buckets; \
- h->n_occupied = h->size; \
- h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
- } \
- } \
- SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
- { \
- khint_t x; \
- if (h->n_occupied >= h->upper_bound) { \
- if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
- else kh_resize_##name(h, h->n_buckets + 1); \
- } \
- { \
- khint_t inc, k, i, site, last; \
- x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
- if (__ac_isempty(h->flags, i)) x = i; \
- else { \
- inc = 1 + k % (h->n_buckets - 1); last = i; \
- while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
- if (__ac_isdel(h->flags, i)) site = i; \
- if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
- else i += inc; \
- if (i == last) { x = site; break; } \
- } \
- if (x == h->n_buckets) { \
- if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
- else x = i; \
- } \
- } \
- } \
- if (__ac_isempty(h->flags, x)) { \
- h->keys[x] = key; \
- __ac_set_isboth_false(h->flags, x); \
- ++h->size; ++h->n_occupied; \
- *ret = 1; \
- } else if (__ac_isdel(h->flags, x)) { \
- h->keys[x] = key; \
- __ac_set_isboth_false(h->flags, x); \
- ++h->size; \
- *ret = 2; \
- } else *ret = 0; \
- return x; \
- } \
- SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
- { \
- if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
- __ac_set_isdel_true(h->flags, x); \
- --h->size; \
- } \
- }
-
-#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
- KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
-
-/* --- BEGIN OF HASH FUNCTIONS --- */
-
-/*! @function
- @abstract Integer hash function
- @param key The integer [khint32_t]
- @return The hash value [khint_t]
- */
-#define kh_int_hash_func(key) (khint32_t)(key)
-/*! @function
- @abstract Integer comparison function
- */
-#define kh_int_hash_equal(a, b) ((a) == (b))
-/*! @function
- @abstract 64-bit integer hash function
- @param key The integer [khint64_t]
- @return The hash value [khint_t]
- */
-#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
-/*! @function
- @abstract 64-bit integer comparison function
- */
-#define kh_int64_hash_equal(a, b) ((a) == (b))
-/*! @function
- @abstract const char* hash function
- @param s Pointer to a null terminated string
- @return The hash value
- */
-static inline khint_t __ac_X31_hash_string(const char *s)
-{
- khint_t h = *s;
- if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
- return h;
-}
-/*! @function
- @abstract Another interface to const char* hash function
- @param key Pointer to a null terminated string [const char*]
- @return The hash value [khint_t]
- */
-#define kh_str_hash_func(key) __ac_X31_hash_string(key)
-/*! @function
- @abstract Const char* comparison function
- */
-#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
-
-/* --- END OF HASH FUNCTIONS --- */
-
-/* Other necessary macros... */
-
-/*!
- @abstract Type of the hash table.
- @param name Name of the hash table [symbol]
- */
-#define khash_t(name) kh_##name##_t
-
-/*! @function
- @abstract Initiate a hash table.
- @param name Name of the hash table [symbol]
- @return Pointer to the hash table [khash_t(name)*]
- */
-#define kh_init(name) kh_init_##name()
-
-/*! @function
- @abstract Destroy a hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- */
-#define kh_destroy(name, h) kh_destroy_##name(h)
-
-/*! @function
- @abstract Reset a hash table without deallocating memory.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- */
-#define kh_clear(name, h) kh_clear_##name(h)
-
-/*! @function
- @abstract Resize a hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- @param s New size [khint_t]
- */
-#define kh_resize(name, h, s) kh_resize_##name(h, s)
-
-/*! @function
- @abstract Insert a key to the hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- @param k Key [type of keys]
- @param r Extra return code: 0 if the key is present in the hash table;
- 1 if the bucket is empty (never used); 2 if the element in
- the bucket has been deleted [int*]
- @return Iterator to the inserted element [khint_t]
- */
-#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
-
-/*! @function
- @abstract Retrieve a key from the hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- @param k Key [type of keys]
- @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
- */
-#define kh_get(name, h, k) kh_get_##name(h, k)
-
-/*! @function
- @abstract Remove a key from the hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- @param k Iterator to the element to be deleted [khint_t]
- */
-#define kh_del(name, h, k) kh_del_##name(h, k)
-
-
-/*! @function
- @abstract Test whether a bucket contains data.
- @param h Pointer to the hash table [khash_t(name)*]
- @param x Iterator to the bucket [khint_t]
- @return 1 if containing data; 0 otherwise [int]
- */
-#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
-
-/*! @function
- @abstract Get key given an iterator
- @param h Pointer to the hash table [khash_t(name)*]
- @param x Iterator to the bucket [khint_t]
- @return Key [type of keys]
- */
-#define kh_key(h, x) ((h)->keys[x])
-
-/*! @function
- @abstract Get value given an iterator
- @param h Pointer to the hash table [khash_t(name)*]
- @param x Iterator to the bucket [khint_t]
- @return Value [type of values]
- @discussion For hash sets, calling this results in segfault.
- */
-#define kh_val(h, x) ((h)->vals[x])
-
-/*! @function
- @abstract Alias of kh_val()
- */
-#define kh_value(h, x) ((h)->vals[x])
-
-/*! @function
- @abstract Get the start iterator
- @param h Pointer to the hash table [khash_t(name)*]
- @return The start iterator [khint_t]
- */
-#define kh_begin(h) (khint_t)(0)
-
-/*! @function
- @abstract Get the end iterator
- @param h Pointer to the hash table [khash_t(name)*]
- @return The end iterator [khint_t]
- */
-#define kh_end(h) ((h)->n_buckets)
-
-/*! @function
- @abstract Get the number of elements in the hash table
- @param h Pointer to the hash table [khash_t(name)*]
- @return Number of elements in the hash table [khint_t]
- */
-#define kh_size(h) ((h)->size)
-
-/*! @function
- @abstract Get the number of buckets in the hash table
- @param h Pointer to the hash table [khash_t(name)*]
- @return Number of buckets in the hash table [khint_t]
- */
-#define kh_n_buckets(h) ((h)->n_buckets)
-
-/* More convenient interfaces */
-
-/*! @function
- @abstract Instantiate a hash set containing integer keys
- @param name Name of the hash table [symbol]
- */
-#define KHASH_SET_INIT_INT(name) \
- KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash map containing integer keys
- @param name Name of the hash table [symbol]
- @param khval_t Type of values [type]
- */
-#define KHASH_MAP_INIT_INT(name, khval_t) \
- KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash set containing 64-bit integer keys
- @param name Name of the hash table [symbol]
- */
-#define KHASH_SET_INIT_INT64(name) \
- KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash map containing 64-bit integer keys
- @param name Name of the hash table [symbol]
- @param khval_t Type of values [type]
- */
-#define KHASH_MAP_INIT_INT64(name, khval_t) \
- KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
-
-typedef const char *kh_cstr_t;
-/*! @function
- @abstract Instantiate a hash set containing const char* keys
- @param name Name of the hash table [symbol]
- */
-#define KHASH_SET_INIT_STR(name) \
- KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash map containing const char* keys
- @param name Name of the hash table [symbol]
- @param khval_t Type of values [type]
- */
-#define KHASH_MAP_INIT_STR(name, khval_t) \
- KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
-
-#endif /* __AC_KHASH_H */
diff --git a/sam/klist.h b/sam/klist.h
deleted file mode 100644
index 2f17016..0000000
--- a/sam/klist.h
+++ /dev/null
@@ -1,96 +0,0 @@
-#ifndef _LH3_KLIST_H
-#define _LH3_KLIST_H
-
-#include <stdlib.h>
-
-#define KMEMPOOL_INIT(name, kmptype_t, kmpfree_f) \
- typedef struct { \
- size_t cnt, n, max; \
- kmptype_t **buf; \
- } kmp_##name##_t; \
- static inline kmp_##name##_t *kmp_init_##name() { \
- return calloc(1, sizeof(kmp_##name##_t)); \
- } \
- static inline void kmp_destroy_##name(kmp_##name##_t *mp) { \
- size_t k; \
- for (k = 0; k < mp->n; ++k) { \
- kmpfree_f(mp->buf[k]); free(mp->buf[k]); \
- } \
- free(mp->buf); free(mp); \
- } \
- static inline kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \
- ++mp->cnt; \
- if (mp->n == 0) return calloc(1, sizeof(kmptype_t)); \
- return mp->buf[--mp->n]; \
- } \
- static inline void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \
- --mp->cnt; \
- if (mp->n == mp->max) { \
- mp->max = mp->max? mp->max<<1 : 16; \
- mp->buf = realloc(mp->buf, sizeof(void*) * mp->max); \
- } \
- mp->buf[mp->n++] = p; \
- }
-
-#define kmempool_t(name) kmp_##name##_t
-#define kmp_init(name) kmp_init_##name()
-#define kmp_destroy(name, mp) kmp_destroy_##name(mp)
-#define kmp_alloc(name, mp) kmp_alloc_##name(mp)
-#define kmp_free(name, mp, p) kmp_free_##name(mp, p)
-
-#define KLIST_INIT(name, kltype_t, kmpfree_t) \
- struct __kl1_##name { \
- kltype_t data; \
- struct __kl1_##name *next; \
- }; \
- typedef struct __kl1_##name kl1_##name; \
- KMEMPOOL_INIT(name, kl1_##name, kmpfree_t) \
- typedef struct { \
- kl1_##name *head, *tail; \
- kmp_##name##_t *mp; \
- size_t size; \
- } kl_##name##_t; \
- static inline kl_##name##_t *kl_init_##name() { \
- kl_##name##_t *kl = calloc(1, sizeof(kl_##name##_t)); \
- kl->mp = kmp_init(name); \
- kl->head = kl->tail = kmp_alloc(name, kl->mp); \
- kl->head->next = 0; \
- return kl; \
- } \
- static inline void kl_destroy_##name(kl_##name##_t *kl) { \
- kl1_##name *p; \
- for (p = kl->head; p != kl->tail; p = p->next) \
- kmp_free(name, kl->mp, p); \
- kmp_free(name, kl->mp, p); \
- kmp_destroy(name, kl->mp); \
- free(kl); \
- } \
- static inline kltype_t *kl_pushp_##name(kl_##name##_t *kl) { \
- kl1_##name *q, *p = kmp_alloc(name, kl->mp); \
- q = kl->tail; p->next = 0; kl->tail->next = p; kl->tail = p; \
- ++kl->size; \
- return &q->data; \
- } \
- static inline int kl_shift_##name(kl_##name##_t *kl, kltype_t *d) { \
- kl1_##name *p; \
- if (kl->head->next == 0) return -1; \
- --kl->size; \
- p = kl->head; kl->head = kl->head->next; \
- if (d) *d = p->data; \
- kmp_free(name, kl->mp, p); \
- return 0; \
- }
-
-#define kliter_t(name) kl1_##name
-#define klist_t(name) kl_##name##_t
-#define kl_val(iter) ((iter)->data)
-#define kl_next(iter) ((iter)->next)
-#define kl_begin(kl) ((kl)->head)
-#define kl_end(kl) ((kl)->tail)
-
-#define kl_init(name) kl_init_##name()
-#define kl_destroy(name, kl) kl_destroy_##name(kl)
-#define kl_pushp(name, kl) kl_pushp_##name(kl)
-#define kl_shift(name, kl, d) kl_shift_##name(kl, d)
-
-#endif
diff --git a/sam/knetfile.c b/sam/knetfile.c
deleted file mode 100644
index af09146..0000000
--- a/sam/knetfile.c
+++ /dev/null
@@ -1,632 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008 by Genome Research Ltd (GRL).
- 2010 by Attractive Chaos <***@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/* Probably I will not do socket programming in the next few years and
- therefore I decide to heavily annotate this file, for Linux and
- Windows as well. -ac */
-
-#include <time.h>
-#include <stdio.h>
-#include <ctype.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-#include <unistd.h>
-#include <sys/types.h>
-
-#ifndef _WIN32
-#include <netdb.h>
-#include <arpa/inet.h>
-#include <sys/socket.h>
-#endif
-
-#include "knetfile.h"
-
-/* In winsock.h, the type of a socket is SOCKET, which is: "typedef
- * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
- * integer -1. In knetfile.c, I use "int" for socket type
- * throughout. This should be improved to avoid confusion.
- *
- * In Linux/Mac, recv() and read() do almost the same thing. You can see
- * in the header file that netread() is simply an alias of read(). In
- * Windows, however, they are different and using recv() is mandatory.
- */
-
-/* This function tests if the file handler is ready for reading (or
- * writing if is_read==0). */
-static int socket_wait(int fd, int is_read)
-{
- fd_set fds, *fdr = 0, *fdw = 0;
- struct timeval tv;
- int ret;
- tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
- FD_ZERO(&fds);
- FD_SET(fd, &fds);
- if (is_read) fdr = &fds;
- else fdw = &fds;
- ret = select(fd+1, fdr, fdw, 0, &tv);
-#ifndef _WIN32
- if (ret == -1) perror("select");
-#else
- if (ret == 0)
- fprintf(stderr, "select time-out\n");
- else if (ret == SOCKET_ERROR)
- fprintf(stderr, "select: %d\n", WSAGetLastError());
-#endif
- return ret;
-}
-
-#ifndef _WIN32
-/* This function does not work with Windows due to the lack of
- * getaddrinfo() in winsock. It is addapted from an example in "Beej's
- * Guide to Network Programming" (http://beej.us/guide/bgnet/). */
-static int socket_connect(const char *host, const char *port)
-{
-#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)
-
- int on = 1, fd;
- struct linger lng = { 0, 0 };
- struct addrinfo hints, *res = 0;
- memset(&hints, 0, sizeof(struct addrinfo));
- hints.ai_family = AF_UNSPEC;
- hints.ai_socktype = SOCK_STREAM;
- /* In Unix/Mac, getaddrinfo() is the most convenient way to get
- * server information. */
- if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo");
- if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
- /* The following two setsockopt() are used by ftplib
- * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they
- * necessary. */
- if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
- if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
- if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
- freeaddrinfo(res);
- return fd;
-}
-#else
-/* MinGW's printf has problem with "%lld" */
-char *int64tostr(char *buf, int64_t x)
-{
- int cnt;
- int i = 0;
- do {
- buf[i++] = '0' + x % 10;
- x /= 10;
- } while (x);
- buf[i] = 0;
- for (cnt = i, i = 0; i < cnt/2; ++i) {
- int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c;
- }
- return buf;
-}
-
-int64_t strtoint64(const char *buf)
-{
- int64_t x;
- for (x = 0; *buf != '\0'; ++buf)
- x = x * 10 + ((int64_t) *buf - 48);
- return x;
-}
-/* In windows, the first thing is to establish the TCP connection. */
-int knet_win32_init()
-{
- WSADATA wsaData;
- return WSAStartup(MAKEWORD(2, 2), &wsaData);
-}
-void knet_win32_destroy()
-{
- WSACleanup();
-}
-/* A slightly modfied version of the following function also works on
- * Mac (and presummably Linux). However, this function is not stable on
- * my Mac. It sometimes works fine but sometimes does not. Therefore for
- * non-Windows OS, I do not use this one. */
-static SOCKET socket_connect(const char *host, const char *port)
-{
-#define __err_connect(func) \
- do { \
- fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \
- return -1; \
- } while (0)
-
- int on = 1;
- SOCKET fd;
- struct linger lng = { 0, 0 };
- struct sockaddr_in server;
- struct hostent *hp = 0;
- // open socket
- if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
- if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
- if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
- // get host info
- if (isalpha(host[0])) hp = gethostbyname(host);
- else {
- struct in_addr addr;
- addr.s_addr = inet_addr(host);
- hp = gethostbyaddr((char*)&addr, 4, AF_INET);
- }
- if (hp == 0) __err_connect("gethost");
- // connect
- server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
- server.sin_family= AF_INET;
- server.sin_port = htons(atoi(port));
- if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
- // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
- return fd;
-}
-#endif
-
-static off_t my_netread(int fd, void *buf, off_t len)
-{
- off_t rest = len, curr, l = 0;
- /* recv() and read() may not read the required length of data with
- * one call. They have to be called repeatedly. */
- while (rest) {
- if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
- curr = netread(fd, buf + l, rest);
- /* According to the glibc manual, section 13.2, a zero returned
- * value indicates end-of-file (EOF), which should mean that
- * read() will not return zero if EOF has not been met but data
- * are not immediately available. */
- if (curr == 0) break;
- l += curr; rest -= curr;
- }
- return l;
-}
-
-/*************************
- * FTP specific routines *
- *************************/
-
-static int kftp_get_response(knetFile *ftp)
-{
-#ifndef _WIN32
- unsigned char c;
-#else
- char c;
-#endif
- int n = 0;
- char *p;
- if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
- while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
- //fputc(c, stderr);
- if (n >= ftp->max_response) {
- ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256;
- ftp->response = realloc(ftp->response, ftp->max_response);
- }
- ftp->response[n++] = c;
- if (c == '\n') {
- if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2])
- && ftp->response[3] != '-') break;
- n = 0;
- continue;
- }
- }
- if (n < 2) return -1;
- ftp->response[n-2] = 0;
- return strtol(ftp->response, &p, 0);
-}
-
-static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
-{
- if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
- netwrite(ftp->ctrl_fd, cmd, strlen(cmd));
- return is_get? kftp_get_response(ftp) : 0;
-}
-
-static int kftp_pasv_prep(knetFile *ftp)
-{
- char *p;
- int v[6];
- kftp_send_cmd(ftp, "PASV\r\n", 1);
- for (p = ftp->response; *p && *p != '('; ++p);
- if (*p != '(') return -1;
- ++p;
- sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
- memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
- ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
- return 0;
-}
-
-
-static int kftp_pasv_connect(knetFile *ftp)
-{
- char host[80], port[10];
- if (ftp->pasv_port == 0) {
- fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
- return -1;
- }
- sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
- sprintf(port, "%d", ftp->pasv_port);
- ftp->fd = socket_connect(host, port);
- if (ftp->fd == -1) return -1;
- return 0;
-}
-
-int kftp_connect(knetFile *ftp)
-{
- ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
- if (ftp->ctrl_fd == -1) return -1;
- kftp_get_response(ftp);
- kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
- kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
- kftp_send_cmd(ftp, "TYPE I\r\n", 1);
- return 0;
-}
-
-int kftp_reconnect(knetFile *ftp)
-{
- if (ftp->ctrl_fd != -1) {
- netclose(ftp->ctrl_fd);
- ftp->ctrl_fd = -1;
- }
- netclose(ftp->fd);
- ftp->fd = -1;
- return kftp_connect(ftp);
-}
-
-// initialize ->type, ->host, ->retr and ->size
-knetFile *kftp_parse_url(const char *fn, const char *mode)
-{
- knetFile *fp;
- char *p;
- int l;
- if (strstr(fn, "ftp://") != fn) return 0;
- for (p = (char*)fn + 6; *p && *p != '/'; ++p);
- if (*p != '/') return 0;
- l = p - fn - 6;
- fp = calloc(1, sizeof(knetFile));
- fp->type = KNF_TYPE_FTP;
- fp->fd = -1;
- /* the Linux/Mac version of socket_connect() also recognizes a port
- * like "ftp", but the Windows version does not. */
- fp->port = strdup("21");
- fp->host = calloc(l + 1, 1);
- if (strchr(mode, 'c')) fp->no_reconnect = 1;
- strncpy(fp->host, fn + 6, l);
- fp->retr = calloc(strlen(p) + 8, 1);
- sprintf(fp->retr, "RETR %s\r\n", p);
- fp->size_cmd = calloc(strlen(p) + 8, 1);
- sprintf(fp->size_cmd, "SIZE %s\r\n", p);
- fp->seek_offset = 0;
- return fp;
-}
-// place ->fd at offset off
-int kftp_connect_file(knetFile *fp)
-{
- int ret;
- long long file_size;
- if (fp->fd != -1) {
- netclose(fp->fd);
- if (fp->no_reconnect) kftp_get_response(fp);
- }
- kftp_pasv_prep(fp);
- kftp_send_cmd(fp, fp->size_cmd, 1);
-#ifndef _WIN32
- if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 )
- {
- fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
- return -1;
- }
-#else
- const char *p = fp->response;
- while (*p != ' ') ++p;
- while (*p < '0' || *p > '9') ++p;
- file_size = strtoint64(p);
-#endif
- fp->file_size = file_size;
- if (fp->offset>=0) {
- char tmp[32];
-#ifndef _WIN32
- sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
-#else
- strcpy(tmp, "REST ");
- int64tostr(tmp + 5, fp->offset);
- strcat(tmp, "\r\n");
-#endif
- kftp_send_cmd(fp, tmp, 1);
- }
- kftp_send_cmd(fp, fp->retr, 0);
- kftp_pasv_connect(fp);
- ret = kftp_get_response(fp);
- if (ret != 150) {
- fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
- netclose(fp->fd);
- fp->fd = -1;
- return -1;
- }
- fp->is_ready = 1;
- return 0;
-}
-
-
-/**************************
- * HTTP specific routines *
- **************************/
-
-knetFile *khttp_parse_url(const char *fn, const char *mode)
-{
- knetFile *fp;
- char *p, *proxy, *q;
- int l;
- if (strstr(fn, "http://") != fn) return 0;
- // set ->http_host
- for (p = (char*)fn + 7; *p && *p != '/'; ++p);
- l = p - fn - 7;
- fp = calloc(1, sizeof(knetFile));
- fp->http_host = calloc(l + 1, 1);
- strncpy(fp->http_host, fn + 7, l);
- fp->http_host[l] = 0;
- for (q = fp->http_host; *q && *q != ':'; ++q);
- if (*q == ':') *q++ = 0;
- // get http_proxy
- proxy = getenv("http_proxy");
- // set ->host, ->port and ->path
- if (proxy == 0) {
- fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
- fp->port = strdup(*q? q : "80");
- fp->path = strdup(*p? p : "/");
- } else {
- fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
- for (q = fp->host; *q && *q != ':'; ++q);
- if (*q == ':') *q++ = 0;
- fp->port = strdup(*q? q : "80");
- fp->path = strdup(fn);
- }
- fp->type = KNF_TYPE_HTTP;
- fp->ctrl_fd = fp->fd = -1;
- fp->seek_offset = 0;
- return fp;
-}
-
-int khttp_connect_file(knetFile *fp)
-{
- int ret, l = 0;
- char *buf, *p;
- if (fp->fd != -1) netclose(fp->fd);
- fp->fd = socket_connect(fp->host, fp->port);
- buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
- l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
- l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
- l += sprintf(buf + l, "\r\n");
- netwrite(fp->fd, buf, l);
- l = 0;
- while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
- if (buf[l] == '\n' && l >= 3)
- if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
- ++l;
- }
- buf[l] = 0;
- if (l < 14) { // prematured header
- netclose(fp->fd);
- fp->fd = -1;
- return -1;
- }
- ret = strtol(buf + 8, &p, 0); // HTTP return code
- if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
- off_t rest = fp->offset;
- while (rest) {
- off_t l = rest < 0x10000? rest : 0x10000;
- rest -= my_netread(fp->fd, buf, l);
- }
- } else if (ret != 206 && ret != 200) {
- free(buf);
- fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
- netclose(fp->fd);
- fp->fd = -1;
- return -1;
- }
- free(buf);
- fp->is_ready = 1;
- return 0;
-}
-
-/********************
- * Generic routines *
- ********************/
-
-knetFile *knet_open(const char *fn, const char *mode)
-{
- knetFile *fp = 0;
- if (mode[0] != 'r') {
- fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
- return 0;
- }
- if (strstr(fn, "ftp://") == fn) {
- fp = kftp_parse_url(fn, mode);
- if (fp == 0) return 0;
- if (kftp_connect(fp) == -1) {
- knet_close(fp);
- return 0;
- }
- kftp_connect_file(fp);
- } else if (strstr(fn, "http://") == fn) {
- fp = khttp_parse_url(fn, mode);
- if (fp == 0) return 0;
- khttp_connect_file(fp);
- } else { // local file
-#ifdef _WIN32
- /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
- * be undefined on some systems, although it is defined on my
- * Mac and the Linux I have tested on. */
- int fd = open(fn, O_RDONLY | O_BINARY);
-#else
- int fd = open(fn, O_RDONLY);
-#endif
- if (fd == -1) {
- perror("open");
- return 0;
- }
- fp = (knetFile*)calloc(1, sizeof(knetFile));
- fp->type = KNF_TYPE_LOCAL;
- fp->fd = fd;
- fp->ctrl_fd = -1;
- }
- if (fp && fp->fd == -1) {
- knet_close(fp);
- return 0;
- }
- return fp;
-}
-
-knetFile *knet_dopen(int fd, const char *mode)
-{
- knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
- fp->type = KNF_TYPE_LOCAL;
- fp->fd = fd;
- return fp;
-}
-
-off_t knet_read(knetFile *fp, void *buf, off_t len)
-{
- off_t l = 0;
- if (fp->fd == -1) return 0;
- if (fp->type == KNF_TYPE_FTP) {
- if (fp->is_ready == 0) {
- if (!fp->no_reconnect) kftp_reconnect(fp);
- kftp_connect_file(fp);
- }
- } else if (fp->type == KNF_TYPE_HTTP) {
- if (fp->is_ready == 0)
- khttp_connect_file(fp);
- }
- if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
- off_t rest = len, curr;
- while (rest) {
- do {
- curr = read(fp->fd, buf + l, rest);
- } while (curr < 0 && EINTR == errno);
- if (curr < 0) return -1;
- if (curr == 0) break;
- l += curr; rest -= curr;
- }
- } else l = my_netread(fp->fd, buf, len);
- fp->offset += l;
- return l;
-}
-
-off_t knet_seek(knetFile *fp, int64_t off, int whence)
-{
- if (whence == SEEK_SET && off == fp->offset) return 0;
- if (fp->type == KNF_TYPE_LOCAL) {
- /* Be aware that lseek() returns the offset after seeking,
- * while fseek() returns zero on success. */
- off_t offset = lseek(fp->fd, off, whence);
- if (offset == -1) {
- // Be silent, it is OK for knet_seek to fail when the file is streamed
- // fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
- return -1;
- }
- fp->offset = offset;
- return 0;
- }
- else if (fp->type == KNF_TYPE_FTP)
- {
- if (whence==SEEK_CUR)
- fp->offset += off;
- else if (whence==SEEK_SET)
- fp->offset = off;
- else if ( whence==SEEK_END)
- fp->offset = fp->file_size+off;
- fp->is_ready = 0;
- return 0;
- }
- else if (fp->type == KNF_TYPE_HTTP)
- {
- if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
- fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
- errno = ESPIPE;
- return -1;
- }
- if (whence==SEEK_CUR)
- fp->offset += off;
- else if (whence==SEEK_SET)
- fp->offset = off;
- fp->is_ready = 0;
- return 0;
- }
- errno = EINVAL;
- fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
- return -1;
-}
-
-int knet_close(knetFile *fp)
-{
- if (fp == 0) return 0;
- if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
- if (fp->fd != -1) {
- /* On Linux/Mac, netclose() is an alias of close(), but on
- * Windows, it is an alias of closesocket(). */
- if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
- else netclose(fp->fd);
- }
- free(fp->host); free(fp->port);
- free(fp->response); free(fp->retr); // FTP specific
- free(fp->path); free(fp->http_host); // HTTP specific
- free(fp);
- return 0;
-}
-
-#ifdef KNETFILE_MAIN
-int main(void)
-{
- char *buf;
- knetFile *fp;
- int type = 4, l;
-#ifdef _WIN32
- knet_win32_init();
-#endif
- buf = calloc(0x100000, 1);
- if (type == 0) {
- fp = knet_open("knetfile.c", "r");
- knet_seek(fp, 1000, SEEK_SET);
- } else if (type == 1) { // NCBI FTP, large file
- fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
- knet_seek(fp, 2500000000ll, SEEK_SET);
- l = knet_read(fp, buf, 255);
- } else if (type == 2) {
- fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
- knet_seek(fp, 1000, SEEK_SET);
- } else if (type == 3) {
- fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
- knet_seek(fp, 1000, SEEK_SET);
- } else if (type == 4) {
- fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
- knet_read(fp, buf, 10000);
- knet_seek(fp, 20000, SEEK_SET);
- knet_seek(fp, 10000, SEEK_SET);
- l = knet_read(fp, buf+10000, 10000000) + 10000;
- }
- if (type != 4 && type != 1) {
- knet_read(fp, buf, 255);
- buf[255] = 0;
- printf("%s\n", buf);
- } else write(fileno(stdout), buf, l);
- knet_close(fp);
- free(buf);
- return 0;
-}
-#endif
diff --git a/sam/knetfile.h b/sam/knetfile.h
deleted file mode 100644
index 0a0e66f..0000000
--- a/sam/knetfile.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef KNETFILE_H
-#define KNETFILE_H
-
-#include <stdint.h>
-#include <fcntl.h>
-
-#ifndef _WIN32
-#define netread(fd, ptr, len) read(fd, ptr, len)
-#define netwrite(fd, ptr, len) write(fd, ptr, len)
-#define netclose(fd) close(fd)
-#else
-#include <winsock2.h>
-#define netread(fd, ptr, len) recv(fd, ptr, len, 0)
-#define netwrite(fd, ptr, len) send(fd, ptr, len, 0)
-#define netclose(fd) closesocket(fd)
-#endif
-
-// FIXME: currently I/O is unbuffered
-
-#define KNF_TYPE_LOCAL 1
-#define KNF_TYPE_FTP 2
-#define KNF_TYPE_HTTP 3
-
-typedef struct knetFile_s {
- int type, fd;
- int64_t offset;
- char *host, *port;
-
- // the following are for FTP only
- int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready;
- char *response, *retr, *size_cmd;
- int64_t seek_offset; // for lazy seek
- int64_t file_size;
-
- // the following are for HTTP only
- char *path, *http_host;
-} knetFile;
-
-#define knet_tell(fp) ((fp)->offset)
-#define knet_fileno(fp) ((fp)->fd)
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef _WIN32
- int knet_win32_init();
- void knet_win32_destroy();
-#endif
-
- knetFile *knet_open(const char *fn, const char *mode);
-
- /*
- This only works with local files.
- */
- knetFile *knet_dopen(int fd, const char *mode);
-
- /*
- If ->is_ready==0, this routine updates ->fd; otherwise, it simply
- reads from ->fd.
- */
- off_t knet_read(knetFile *fp, void *buf, off_t len);
-
- /*
- This routine only sets ->offset and ->is_ready=0. It does not
- communicate with the FTP server.
- */
- off_t knet_seek(knetFile *fp, int64_t off, int whence);
- int knet_close(knetFile *fp);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/sam/kprobaln.c b/sam/kprobaln.c
deleted file mode 100644
index 04e526a..0000000
--- a/sam/kprobaln.c
+++ /dev/null
@@ -1,280 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2003-2006, 2008-2010, by Heng Li <***@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-#include <math.h>
-#include "kprobaln.h"
-
-/*****************************************
- * Probabilistic banded glocal alignment *
- *****************************************/
-
-#define EI .25
-#define EM .33333333333
-
-static float g_qual2prob[256];
-
-#define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; }
-
-kpa_par_t kpa_par_def = { 0.001, 0.1, 10 };
-kpa_par_t kpa_par_alt = { 0.0001, 0.01, 10 };
-
-/*
- The topology of the profile HMM:
-
- /\ /\ /\ /\
- I[1] I[k-1] I[k] I[L]
- ^ \ \ ^ \ ^ \ \ ^
- | \ \ | \ | \ \ |
- M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1]
- \ \/ \/ \/ /
- \ /\ /\ /\ /
- -> D[k-1] -> D[k] ->
-
- M[0] points to every {M,I}[k] and every {M,I}[k] points M[L+1].
-
- On input, _ref is the reference sequence and _query is the query
- sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an
- ambiguous residue. iqual is the base quality. c sets the gap open
- probability, gap extension probability and band width.
-
- On output, state and q are arrays of length l_query. The higher 30
- bits give the reference position the query base is matched to and the
- lower two bits can be 0 (an alignment match) or 1 (an
- insertion). q[i] gives the phred scaled posterior probability of
- state[i] being wrong.
- */
-int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual,
- const kpa_par_t *c, int *state, uint8_t *q)
-{
- double **f, **b = 0, *s, m[9], sI, sM, bI, bM, pb;
- float *qual, *_qual;
- const uint8_t *ref, *query;
- int bw, bw2, i, k, is_diff = 0, is_backward = 1, Pr;
-
- if ( l_ref<=0 || l_query<=0 ) return 0; // FIXME: this may not be an ideal fix, just prevents sefgault
-
- /*** initialization ***/
- is_backward = state && q? 1 : 0;
- ref = _ref - 1; query = _query - 1; // change to 1-based coordinate
- bw = l_ref > l_query? l_ref : l_query;
- if (bw > c->bw) bw = c->bw;
- if (bw < abs(l_ref - l_query)) bw = abs(l_ref - l_query);
- bw2 = bw * 2 + 1;
- // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
- f = calloc(l_query+1, sizeof(void*));
- if (is_backward) b = calloc(l_query+1, sizeof(void*));
- for (i = 0; i <= l_query; ++i) { // FIXME: this will lead in segfault for l_query==0
- f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs
- if (is_backward) b[i] = calloc(bw2 * 3 + 6, sizeof(double));
- }
- s = calloc(l_query+2, sizeof(double)); // s[] is the scaling factor to avoid underflow
- // initialize qual
- _qual = calloc(l_query, sizeof(float));
- if (g_qual2prob[0] == 0)
- for (i = 0; i < 256; ++i)
- g_qual2prob[i] = pow(10, -i/10.);
- for (i = 0; i < l_query; ++i) _qual[i] = g_qual2prob[iqual? iqual[i] : 30];
- qual = _qual - 1;
- // initialize transition probability
- sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof
- m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM);
- m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.;
- m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e;
- bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1
- /*** forward ***/
- // f[0]
- set_u(k, bw, 0, 0);
- f[0][k] = s[0] = 1.;
- { // f[1]
- double *fi = f[1], sum;
- int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end;
- for (k = beg, sum = 0.; k <= end; ++k) {
- int u;
- double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
- set_u(u, bw, 1, k);
- fi[u+0] = e * bM; fi[u+1] = EI * bI;
- sum += fi[u] + fi[u+1];
- }
- // rescale
- s[1] = sum;
- set_u(_beg, bw, 1, beg); set_u(_end, bw, 1, end); _end += 2;
- for (k = _beg; k <= _end; ++k) fi[k] /= sum;
- }
- // f[2..l_query]
- for (i = 2; i <= l_query; ++i) {
- double *fi = f[i], *fi1 = f[i-1], sum, qli = qual[i];
- int beg = 1, end = l_ref, x, _beg, _end;
- uint8_t qyi = query[i];
- x = i - bw; beg = beg > x? beg : x; // band start
- x = i + bw; end = end < x? end : x; // band end
- for (k = beg, sum = 0.; k <= end; ++k) {
- int u, v11, v01, v10;
- double e;
- e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli * EM;
- set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1);
- fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]);
- fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
- fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
- sum += fi[u] + fi[u+1] + fi[u+2];
-// fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG
- }
- // rescale
- s[i] = sum;
- set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
- for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum;
- }
- { // f[l_query+1]
- double sum;
- for (k = 1, sum = 0.; k <= l_ref; ++k) {
- int u;
- set_u(u, bw, l_query, k);
- if (u < 3 || u >= bw2*3+3) continue;
- sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI;
- }
- s[l_query+1] = sum; // the last scaling factor
- }
- { // compute likelihood
- double p = 1., Pr1 = 0.;
- for (i = 0; i <= l_query + 1; ++i) {
- p *= s[i];
- if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.;
- }
- Pr1 += -4.343 * log(p * l_ref * l_query);
- Pr = (int)(Pr1 + .499);
- if (!is_backward) { // skip backward and MAP
- for (i = 0; i <= l_query; ++i) free(f[i]);
- free(f); free(s); free(_qual);
- return Pr;
- }
- }
- /*** backward ***/
- // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
- for (k = 1; k <= l_ref; ++k) {
- int u;
- double *bi = b[l_query];
- set_u(u, bw, l_query, k);
- if (u < 3 || u >= bw2*3+3) continue;
- bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1];
- }
- // b[l_query-1..1]
- for (i = l_query - 1; i >= 1; --i) {
- int beg = 1, end = l_ref, x, _beg, _end;
- double *bi = b[i], *bi1 = b[i+1], y = (i > 1), qli1 = qual[i+1];
- uint8_t qyi1 = query[i+1];
- x = i - bw; beg = beg > x? beg : x;
- x = i + bw; end = end < x? end : x;
- for (k = end; k >= beg; --k) {
- int u, v11, v01, v10;
- double e;
- set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1);
- e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 * EM) * bi1[v11];
- bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e.
- bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1];
- bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y;
-// fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG
- }
- // rescale
- set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
- for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y;
- }
- { // b[0]
- int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
- double sum = 0.;
- for (k = end; k >= beg; --k) {
- int u;
- double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
- set_u(u, bw, 1, k);
- if (u < 3 || u >= bw2*3+3) continue;
- sum += e * b[1][u+0] * bM + EI * b[1][u+1] * bI;
- }
- set_u(k, bw, 0, 0);
- pb = b[0][k] = sum / s[0]; // if everything works as is expected, pb == 1.0
- }
- is_diff = fabs(pb - 1.) > 1e-7? 1 : 0;
- /*** MAP ***/
- for (i = 1; i <= l_query; ++i) {
- double sum = 0., *fi = f[i], *bi = b[i], max = 0.;
- int beg = 1, end = l_ref, x, max_k = -1;
- x = i - bw; beg = beg > x? beg : x;
- x = i + bw; end = end < x? end : x;
- for (k = beg; k <= end; ++k) {
- int u;
- double z;
- set_u(u, bw, i, k);
- z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z;
- z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z;
- }
- max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
- if (state) state[i-1] = max_k;
- if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k;
-#ifdef _MAIN
- fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2,
- "ACGT"[query[i]], "ACGT"[ref[(max_k>>2)+1]], max_k&3, max); // DEBUG
-#endif
- }
- /*** free ***/
- for (i = 0; i <= l_query; ++i) {
- free(f[i]); free(b[i]);
- }
- free(f); free(b); free(s); free(_qual);
- return Pr;
-}
-
-#ifdef _MAIN
-#include <unistd.h>
-int main(int argc, char *argv[])
-{
- uint8_t conv[256], *iqual, *ref, *query;
- int c, l_ref, l_query, i, q = 30, b = 10, P;
- while ((c = getopt(argc, argv, "b:q:")) >= 0) {
- switch (c) {
- case 'b': b = atoi(optarg); break;
- case 'q': q = atoi(optarg); break;
- }
- }
- if (optind + 2 > argc) {
- fprintf(stderr, "Usage: %s [-q %d] [-b %d] <ref> <query>\n", argv[0], q, b); // example: acttc attc
- return 1;
- }
- memset(conv, 4, 256);
- conv['a'] = conv['A'] = 0; conv['c'] = conv['C'] = 1;
- conv['g'] = conv['G'] = 2; conv['t'] = conv['T'] = 3;
- ref = (uint8_t*)argv[optind]; query = (uint8_t*)argv[optind+1];
- l_ref = strlen((char*)ref); l_query = strlen((char*)query);
- for (i = 0; i < l_ref; ++i) ref[i] = conv[ref[i]];
- for (i = 0; i < l_query; ++i) query[i] = conv[query[i]];
- iqual = malloc(l_query);
- memset(iqual, q, l_query);
- kpa_par_def.bw = b;
- P = kpa_glocal(ref, l_ref, query, l_query, iqual, &kpa_par_alt, 0, 0);
- fprintf(stderr, "%d\n", P);
- free(iqual);
- return 0;
-}
-#endif
diff --git a/sam/kprobaln.h b/sam/kprobaln.h
deleted file mode 100644
index 0357dcc..0000000
--- a/sam/kprobaln.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2003-2006, 2008, 2009 by Heng Li <***@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-#ifndef LH3_KPROBALN_H_
-#define LH3_KPROBALN_H_
-
-#include <stdint.h>
-
-typedef struct {
- float d, e;
- int bw;
-} kpa_par_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual,
- const kpa_par_t *c, int *state, uint8_t *q);
-
-#ifdef __cplusplus
-}
-#endif
-
-extern kpa_par_t kpa_par_def, kpa_par_alt;
-
-#endif
diff --git a/sam/kseq.h b/sam/kseq.h
deleted file mode 100644
index a5cec7c..0000000
--- a/sam/kseq.h
+++ /dev/null
@@ -1,235 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008, 2009, 2011 Attractive Chaos <***@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/* Last Modified: 05MAR2012 */
-
-#ifndef AC_KSEQ_H
-#define AC_KSEQ_H
-
-#include <ctype.h>
-#include <string.h>
-#include <stdlib.h>
-
-#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
-#define KS_SEP_TAB 1 // isspace() && !' '
-#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
-#define KS_SEP_MAX 2
-
-#define __KS_TYPE(type_t) \
- typedef struct __kstream_t { \
- unsigned char *buf; \
- int begin, end, is_eof; \
- type_t f; \
- } kstream_t;
-
-#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
-#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
-
-#define __KS_BASIC(type_t, __bufsize) \
- static inline kstream_t *ks_init(type_t f) \
- { \
- kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
- ks->f = f; \
- ks->buf = (unsigned char*)malloc(__bufsize); \
- return ks; \
- } \
- static inline void ks_destroy(kstream_t *ks) \
- { \
- if (ks) { \
- free(ks->buf); \
- free(ks); \
- } \
- }
-
-#define __KS_GETC(__read, __bufsize) \
- static inline int ks_getc(kstream_t *ks) \
- { \
- if (ks->is_eof && ks->begin >= ks->end) return -1; \
- if (ks->begin >= ks->end) { \
- ks->begin = 0; \
- ks->end = __read(ks->f, ks->buf, __bufsize); \
- if (ks->end < __bufsize) ks->is_eof = 1; \
- if (ks->end == 0) return -1; \
- } \
- return (int)ks->buf[ks->begin++]; \
- }
-
-#ifndef KSTRING_T
-#define KSTRING_T kstring_t
-typedef struct __kstring_t {
- size_t l, m;
- char *s;
-} kstring_t;
-#endif
-
-#ifndef kroundup32
-#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
-#endif
-
-#define __KS_GETUNTIL(__read, __bufsize) \
- static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
- { \
- if (dret) *dret = 0; \
- str->l = append? str->l : 0; \
- if (ks->begin >= ks->end && ks->is_eof) return -1; \
- for (;;) { \
- int i; \
- if (ks->begin >= ks->end) { \
- if (!ks->is_eof) { \
- ks->begin = 0; \
- ks->end = __read(ks->f, ks->buf, __bufsize); \
- if (ks->end < __bufsize) ks->is_eof = 1; \
- if (ks->end == 0) break; \
- } else break; \
- } \
- if (delimiter == KS_SEP_LINE) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (ks->buf[i] == '\n') break; \
- } else if (delimiter > KS_SEP_MAX) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (ks->buf[i] == delimiter) break; \
- } else if (delimiter == KS_SEP_SPACE) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (isspace(ks->buf[i])) break; \
- } else if (delimiter == KS_SEP_TAB) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
- } else i = 0; /* never come to here! */ \
- if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
- str->m = str->l + (i - ks->begin) + 1; \
- kroundup32(str->m); \
- str->s = (char*)realloc(str->s, str->m); \
- } \
- memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
- str->l = str->l + (i - ks->begin); \
- ks->begin = i + 1; \
- if (i < ks->end) { \
- if (dret) *dret = ks->buf[i]; \
- break; \
- } \
- } \
- if (str->s == 0) { \
- str->m = 1; \
- str->s = (char*)calloc(1, 1); \
- } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
- str->s[str->l] = '\0'; \
- return str->l; \
- } \
- static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
- { return ks_getuntil2(ks, delimiter, str, dret, 0); }
-
-#define KSTREAM_INIT(type_t, __read, __bufsize) \
- __KS_TYPE(type_t) \
- __KS_BASIC(type_t, __bufsize) \
- __KS_GETC(__read, __bufsize) \
- __KS_GETUNTIL(__read, __bufsize)
-
-#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
-
-#define __KSEQ_BASIC(SCOPE, type_t) \
- SCOPE kseq_t *kseq_init(type_t fd) \
- { \
- kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
- s->f = ks_init(fd); \
- return s; \
- } \
- SCOPE void kseq_destroy(kseq_t *ks) \
- { \
- if (!ks) return; \
- free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
- ks_destroy(ks->f); \
- free(ks); \
- }
-
-/* Return value:
- >=0 length of the sequence (normal)
- -1 end-of-file
- -2 truncated quality string
- */
-#define __KSEQ_READ(SCOPE) \
- SCOPE int kseq_read(kseq_t *seq) \
- { \
- int c; \
- kstream_t *ks = seq->f; \
- if (seq->last_char == 0) { /* then jump to the next header line */ \
- while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
- if (c == -1) return -1; /* end of file */ \
- seq->last_char = c; \
- } /* else: the first header char has been read in the previous call */ \
- seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
- if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
- if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
- if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
- seq->seq.m = 256; \
- seq->seq.s = (char*)malloc(seq->seq.m); \
- } \
- while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
- if (c == '\n') continue; /* skip empty lines */ \
- seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
- ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
- } \
- if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
- if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
- seq->seq.m = seq->seq.l + 2; \
- kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
- seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
- } \
- seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
- if (c != '+') return seq->seq.l; /* FASTA */ \
- if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
- seq->qual.m = seq->seq.m; \
- seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
- } \
- while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
- if (c == -1) return -2; /* error: no quality string */ \
- while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
- seq->last_char = 0; /* we have not come to the next header line */ \
- if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
- return seq->seq.l; \
- }
-
-#define __KSEQ_TYPE(type_t) \
- typedef struct { \
- kstring_t name, comment, seq, qual; \
- int last_char; \
- kstream_t *f; \
- } kseq_t;
-
-#define KSEQ_INIT2(SCOPE, type_t, __read) \
- KSTREAM_INIT(type_t, __read, 16384) \
- __KSEQ_TYPE(type_t) \
- __KSEQ_BASIC(SCOPE, type_t) \
- __KSEQ_READ(SCOPE)
-
-#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
-
-#define KSEQ_DECLARE(type_t) \
- __KS_TYPE(type_t) \
- __KSEQ_TYPE(type_t) \
- extern kseq_t *kseq_init(type_t fd); \
- void kseq_destroy(kseq_t *ks); \
- int kseq_read(kseq_t *seq);
-
-#endif
diff --git a/sam/ksort.h b/sam/ksort.h
deleted file mode 100644
index aa0bb93..0000000
--- a/sam/ksort.h
+++ /dev/null
@@ -1,285 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008 Genome Research Ltd (GRL).
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/* Contact: Heng Li <***@sanger.ac.uk> */
-
-/*
- 2012-12-11 (0.1.4):
-
- * Defined __ks_insertsort_##name as static to compile with C99.
-
- 2008-11-16 (0.1.4):
-
- * Fixed a bug in introsort() that happens in rare cases.
-
- 2008-11-05 (0.1.3):
-
- * Fixed a bug in introsort() for complex comparisons.
-
- * Fixed a bug in mergesort(). The previous version is not stable.
-
- 2008-09-15 (0.1.2):
-
- * Accelerated introsort. On my Mac (not on another Linux machine),
- my implementation is as fast as std::sort on random input.
-
- * Added combsort and in introsort, switch to combsort if the
- recursion is too deep.
-
- 2008-09-13 (0.1.1):
-
- * Added k-small algorithm
-
- 2008-09-05 (0.1.0):
-
- * Initial version
-
-*/
-
-#ifndef AC_KSORT_H
-#define AC_KSORT_H
-
-#include <stdlib.h>
-#include <string.h>
-
-typedef struct {
- void *left, *right;
- int depth;
-} ks_isort_stack_t;
-
-#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
-
-#define KSORT_INIT(name, type_t, __sort_lt) \
- void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \
- { \
- type_t *a2[2], *a, *b; \
- int curr, shift; \
- \
- a2[0] = array; \
- a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \
- for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) { \
- a = a2[curr]; b = a2[1-curr]; \
- if (shift == 0) { \
- type_t *p = b, *i, *eb = a + n; \
- for (i = a; i < eb; i += 2) { \
- if (i == eb - 1) *p++ = *i; \
- else { \
- if (__sort_lt(*(i+1), *i)) { \
- *p++ = *(i+1); *p++ = *i; \
- } else { \
- *p++ = *i; *p++ = *(i+1); \
- } \
- } \
- } \
- } else { \
- size_t i, step = 1ul<<shift; \
- for (i = 0; i < n; i += step<<1) { \
- type_t *p, *j, *k, *ea, *eb; \
- if (n < i + step) { \
- ea = a + n; eb = a; \
- } else { \
- ea = a + i + step; \
- eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
- } \
- j = a + i; k = a + i + step; p = b + i; \
- while (j < ea && k < eb) { \
- if (__sort_lt(*k, *j)) *p++ = *k++; \
- else *p++ = *j++; \
- } \
- while (j < ea) *p++ = *j++; \
- while (k < eb) *p++ = *k++; \
- } \
- } \
- curr = 1 - curr; \
- } \
- if (curr == 1) { \
- type_t *p = a2[0], *i = a2[1], *eb = array + n; \
- for (; p < eb; ++i) *p++ = *i; \
- } \
- if (temp == 0) free(a2[1]); \
- } \
- void ks_heapadjust_##name(size_t i, size_t n, type_t l[]) \
- { \
- size_t k = i; \
- type_t tmp = l[i]; \
- while ((k = (k << 1) + 1) < n) { \
- if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k; \
- if (__sort_lt(l[k], tmp)) break; \
- l[i] = l[k]; i = k; \
- } \
- l[i] = tmp; \
- } \
- void ks_heapmake_##name(size_t lsize, type_t l[]) \
- { \
- size_t i; \
- for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) \
- ks_heapadjust_##name(i, lsize, l); \
- } \
- void ks_heapsort_##name(size_t lsize, type_t l[]) \
- { \
- size_t i; \
- for (i = lsize - 1; i > 0; --i) { \
- type_t tmp; \
- tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
- } \
- } \
- static inline void __ks_insertsort_##name(type_t *s, type_t *t) \
- { \
- type_t *i, *j, swap_tmp; \
- for (i = s + 1; i < t; ++i) \
- for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \
- swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \
- } \
- } \
- void ks_combsort_##name(size_t n, type_t a[]) \
- { \
- const double shrink_factor = 1.2473309501039786540366528676643; \
- int do_swap; \
- size_t gap = n; \
- type_t tmp, *i, *j; \
- do { \
- if (gap > 2) { \
- gap = (size_t)(gap / shrink_factor); \
- if (gap == 9 || gap == 10) gap = 11; \
- } \
- do_swap = 0; \
- for (i = a; i < a + n - gap; ++i) { \
- j = i + gap; \
- if (__sort_lt(*j, *i)) { \
- tmp = *i; *i = *j; *j = tmp; \
- do_swap = 1; \
- } \
- } \
- } while (do_swap || gap > 2); \
- if (gap != 1) __ks_insertsort_##name(a, a + n); \
- } \
- void ks_introsort_##name(size_t n, type_t a[]) \
- { \
- int d; \
- ks_isort_stack_t *top, *stack; \
- type_t rp, swap_tmp; \
- type_t *s, *t, *i, *j, *k; \
- \
- if (n < 1) return; \
- else if (n == 2) { \
- if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
- return; \
- } \
- for (d = 2; 1ul<<d < n; ++d); \
- stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
- top = stack; s = a; t = a + (n-1); d <<= 1; \
- while (1) { \
- if (s < t) { \
- if (--d == 0) { \
- ks_combsort_##name(t - s + 1, s); \
- t = s; \
- continue; \
- } \
- i = s; j = t; k = i + ((j-i)>>1) + 1; \
- if (__sort_lt(*k, *i)) { \
- if (__sort_lt(*k, *j)) k = j; \
- } else k = __sort_lt(*j, *i)? i : j; \
- rp = *k; \
- if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \
- for (;;) { \
- do ++i; while (__sort_lt(*i, rp)); \
- do --j; while (i <= j && __sort_lt(rp, *j)); \
- if (j <= i) break; \
- swap_tmp = *i; *i = *j; *j = swap_tmp; \
- } \
- swap_tmp = *i; *i = *t; *t = swap_tmp; \
- if (i-s > t-i) { \
- if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
- s = t-i > 16? i+1 : t; \
- } else { \
- if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
- t = i-s > 16? i-1 : s; \
- } \
- } else { \
- if (top == stack) { \
- free(stack); \
- __ks_insertsort_##name(a, a+n); \
- return; \
- } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
- } \
- } \
- } \
- /* This function is adapted from: http://ndevilla.free.fr/median/ */ \
- /* 0 <= kk < n */ \
- type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \
- { \
- type_t *low, *high, *k, *ll, *hh, *mid; \
- low = arr; high = arr + n - 1; k = arr + kk; \
- for (;;) { \
- if (high <= low) return *k; \
- if (high == low + 1) { \
- if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
- return *k; \
- } \
- mid = low + (high - low) / 2; \
- if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
- if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
- if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
- KSORT_SWAP(type_t, *mid, *(low+1)); \
- ll = low + 1; hh = high; \
- for (;;) { \
- do ++ll; while (__sort_lt(*ll, *low)); \
- do --hh; while (__sort_lt(*low, *hh)); \
- if (hh < ll) break; \
- KSORT_SWAP(type_t, *ll, *hh); \
- } \
- KSORT_SWAP(type_t, *low, *hh); \
- if (hh <= k) low = ll; \
- if (hh >= k) high = hh - 1; \
- } \
- } \
- void ks_shuffle_##name(size_t n, type_t a[]) \
- { \
- int i, j; \
- for (i = n; i > 1; --i) { \
- type_t tmp; \
- j = (int)(drand48() * i); \
- tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \
- } \
- }
-
-#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
-#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
-#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
-#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
-#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
-#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
-#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
-#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a)
-
-#define ks_lt_generic(a, b) ((a) < (b))
-#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
-
-typedef const char *ksstr_t;
-
-#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
-#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
-
-#endif
diff --git a/sam/kstring.c b/sam/kstring.c
deleted file mode 100644
index b8ff45c..0000000
--- a/sam/kstring.c
+++ /dev/null
@@ -1,212 +0,0 @@
-#include <stdarg.h>
-#include <stdio.h>
-#include <ctype.h>
-#include <string.h>
-#include <stdint.h>
-#include "kstring.h"
-
-int ksprintf(kstring_t *s, const char *fmt, ...)
-{
- va_list ap;
- int l;
- va_start(ap, fmt);
- l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'.
- va_end(ap);
- if (l + 1 > s->m - s->l) {
- s->m = s->l + l + 2;
- kroundup32(s->m);
- s->s = (char*)realloc(s->s, s->m);
- va_start(ap, fmt);
- l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap);
- }
- va_end(ap);
- s->l += l;
- return l;
-}
-
-char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux)
-{
- const char *p, *start;
- if (sep) { // set up the table
- if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished
- aux->finished = 0;
- if (sep[1]) {
- aux->sep = -1;
- aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
- for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f);
- } else aux->sep = sep[0];
- }
- if (aux->finished) return 0;
- else if (str) aux->p = str - 1, aux->finished = 0;
- if (aux->sep < 0) {
- for (p = start = aux->p + 1; *p; ++p)
- if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
- } else {
- for (p = start = aux->p + 1; *p; ++p)
- if (*p == aux->sep) break;
- }
- aux->p = p; // end of token
- if (*p == 0) aux->finished = 1; // no more tokens
- return (char*)start;
-}
-
-// s MUST BE a null terminated string; l = strlen(s)
-int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
-{
- int i, n, max, last_char, last_start, *offsets, l;
- n = 0; max = *_max; offsets = *_offsets;
- l = strlen(s);
-
-#define __ksplit_aux do { \
- if (_offsets) { \
- s[i] = 0; \
- if (n == max) { \
- max = max? max<<1 : 2; \
- offsets = (int*)realloc(offsets, sizeof(int) * max); \
- } \
- offsets[n++] = last_start; \
- } else ++n; \
- } while (0)
-
- for (i = 0, last_char = last_start = 0; i <= l; ++i) {
- if (delimiter == 0) {
- if (isspace(s[i]) || s[i] == 0) {
- if (isgraph(last_char)) __ksplit_aux; // the end of a field
- } else {
- if (isspace(last_char) || last_char == 0) last_start = i;
- }
- } else {
- if (s[i] == delimiter || s[i] == 0) {
- if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
- } else {
- if (last_char == delimiter || last_char == 0) last_start = i;
- }
- }
- last_char = s[i];
- }
- *_max = max; *_offsets = offsets;
- return n;
-}
-
-/**********************
- * Boyer-Moore search *
- **********************/
-
-typedef unsigned char ubyte_t;
-
-// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
-static int *ksBM_prep(const ubyte_t *pat, int m)
-{
- int i, *suff, *prep, *bmGs, *bmBc;
- prep = (int*)calloc(m + 256, sizeof(int));
- bmGs = prep; bmBc = prep + m;
- { // preBmBc()
- for (i = 0; i < 256; ++i) bmBc[i] = m;
- for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
- }
- suff = (int*)calloc(m, sizeof(int));
- { // suffixes()
- int f = 0, g;
- suff[m - 1] = m;
- g = m - 1;
- for (i = m - 2; i >= 0; --i) {
- if (i > g && suff[i + m - 1 - f] < i - g)
- suff[i] = suff[i + m - 1 - f];
- else {
- if (i < g) g = i;
- f = i;
- while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
- suff[i] = f - g;
- }
- }
- }
- { // preBmGs()
- int j = 0;
- for (i = 0; i < m; ++i) bmGs[i] = m;
- for (i = m - 1; i >= 0; --i)
- if (suff[i] == i + 1)
- for (; j < m - 1 - i; ++j)
- if (bmGs[j] == m)
- bmGs[j] = m - 1 - i;
- for (i = 0; i <= m - 2; ++i)
- bmGs[m - 1 - suff[i]] = m - 1 - i;
- }
- free(suff);
- return prep;
-}
-
-void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep)
-{
- int i, j, *prep = 0, *bmGs, *bmBc;
- const ubyte_t *str, *pat;
- str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat;
- prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep;
- if (_prep && *_prep == 0) *_prep = prep;
- bmGs = prep; bmBc = prep + m;
- j = 0;
- while (j <= n - m) {
- for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
- if (i >= 0) {
- int max = bmBc[str[i+j]] - m + 1 + i;
- if (max < bmGs[i]) max = bmGs[i];
- j += max;
- } else return (void*)(str + j);
- }
- if (_prep == 0) free(prep);
- return 0;
-}
-
-char *kstrstr(const char *str, const char *pat, int **_prep)
-{
- return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep);
-}
-
-char *kstrnstr(const char *str, const char *pat, int n, int **_prep)
-{
- return (char*)kmemmem(str, n, pat, strlen(pat), _prep);
-}
-
-/***********************
- * The main() function *
- ***********************/
-
-#ifdef KSTRING_MAIN
-#include <stdio.h>
-int main()
-{
- kstring_t *s;
- int *fields, n, i;
- ks_tokaux_t aux;
- char *p;
- s = (kstring_t*)calloc(1, sizeof(kstring_t));
- // test ksprintf()
- ksprintf(s, " abcdefg: %d ", 100);
- printf("'%s'\n", s->s);
- // test ksplit()
- fields = ksplit(s, 0, &n);
- for (i = 0; i < n; ++i)
- printf("field[%d] = '%s'\n", i, s->s + fields[i]);
- // test kstrtok()
- s->l = 0;
- for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) {
- kputsn(p, aux.p - p, s);
- kputc('\n', s);
- }
- printf("%s", s->s);
- // free
- free(s->s); free(s); free(fields);
-
- {
- static char *str = "abcdefgcdgcagtcakcdcd";
- static char *pat = "cd";
- char *ret, *s = str;
- int *prep = 0;
- while ((ret = kstrstr(s, pat, &prep)) != 0) {
- printf("match: %s\n", ret);
- s = ret + prep[0];
- }
- free(prep);
- }
- return 0;
-}
-#endif
diff --git a/sam/kstring.h b/sam/kstring.h
deleted file mode 100644
index abd8236..0000000
--- a/sam/kstring.h
+++ /dev/null
@@ -1,169 +0,0 @@
-/* The MIT License
-
- Copyright (c) by Attractive Chaos <***@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-#ifndef KSTRING_H
-#define KSTRING_H
-
-#include <stdlib.h>
-#include <string.h>
-#include <stdint.h>
-
-#ifndef kroundup32
-#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
-#endif
-
-#ifndef KSTRING_T
-#define KSTRING_T kstring_t
-typedef struct __kstring_t {
- size_t l, m;
- char *s;
-} kstring_t;
-#endif
-
-typedef struct {
- uint64_t tab[4];
- int sep, finished;
- const char *p; // end of the current token
-} ks_tokaux_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- int ksprintf(kstring_t *s, const char *fmt, ...);
- int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
- char *kstrstr(const char *str, const char *pat, int **_prep);
- char *kstrnstr(const char *str, const char *pat, int n, int **_prep);
- void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep);
-
- /* kstrtok() is similar to strtok_r() except that str is not
- * modified and both str and sep can be NULL. For efficiency, it is
- * actually recommended to set both to NULL in the subsequent calls
- * if sep is not changed. */
- char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux);
-
-#ifdef __cplusplus
-}
-#endif
-
-static inline void ks_resize(kstring_t *s, size_t size)
-{
- if (s->m < size) {
- s->m = size;
- kroundup32(s->m);
- s->s = (char*)realloc(s->s, s->m);
- }
-}
-
-static inline int kputsn(const char *p, int l, kstring_t *s)
-{
- if (s->l + l + 1 >= s->m) {
- s->m = s->l + l + 2;
- kroundup32(s->m);
- s->s = (char*)realloc(s->s, s->m);
- }
- memcpy(s->s + s->l, p, l);
- s->l += l;
- s->s[s->l] = 0;
- return l;
-}
-
-static inline int kputs(const char *p, kstring_t *s)
-{
- return kputsn(p, strlen(p), s);
-}
-
-static inline int kputc(int c, kstring_t *s)
-{
- if (s->l + 1 >= s->m) {
- s->m = s->l + 2;
- kroundup32(s->m);
- s->s = (char*)realloc(s->s, s->m);
- }
- s->s[s->l++] = c;
- s->s[s->l] = 0;
- return c;
-}
-
-static inline int kputw(int c, kstring_t *s)
-{
- char buf[16];
- int l, x;
- if (c == 0) return kputc('0', s);
- if(c < 0) for (l = 0, x = c; x < 0; x /= 10) buf[l++] = '0' - (x%10);
- else for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0';
- if (c < 0) buf[l++] = '-';
- if (s->l + l + 1 >= s->m) {
- s->m = s->l + l + 2;
- kroundup32(s->m);
- s->s = (char*)realloc(s->s, s->m);
- }
- for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
- s->s[s->l] = 0;
- return 0;
-}
-
-static inline int kputuw(unsigned c, kstring_t *s)
-{
- char buf[16];
- int l, i;
- unsigned x;
- if (c == 0) return kputc('0', s);
- for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0';
- if (s->l + l + 1 >= s->m) {
- s->m = s->l + l + 2;
- kroundup32(s->m);
- s->s = (char*)realloc(s->s, s->m);
- }
- for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
- s->s[s->l] = 0;
- return 0;
-}
-
-static inline int kputl(long c, kstring_t *s)
-{
- char buf[32];
- long l, x;
- if (c == 0) return kputc('0', s);
- for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
- if (c < 0) buf[l++] = '-';
- if (s->l + l + 1 >= s->m) {
- s->m = s->l + l + 2;
- kroundup32(s->m);
- s->s = (char*)realloc(s->s, s->m);
- }
- for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
- s->s[s->l] = 0;
- return 0;
-}
-
-static inline int *ksplit(kstring_t *s, int delimiter, int *n)
-{
- int max = 0, *offsets = 0;
- *n = ksplit_core(s->s, delimiter, &max, &offsets);
- return offsets;
-}
-
-#endif
diff --git a/sam/misc/._HmmGlocal.java b/sam/misc/._HmmGlocal.java
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/misc/._HmmGlocal.java and /dev/null differ
diff --git a/sam/misc/._Makefile b/sam/misc/._Makefile
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/misc/._Makefile and /dev/null differ
diff --git a/sam/misc/._ace2sam.c b/sam/misc/._ace2sam.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/misc/._ace2sam.c and /dev/null differ
diff --git a/sam/misc/._bamcheck.c b/sam/misc/._bamcheck.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/misc/._bamcheck.c and /dev/null differ
diff --git a/sam/misc/._blast2sam.pl b/sam/misc/._blast2sam.pl
deleted file mode 100755
index 94286bb..0000000
Binary files a/sam/misc/._blast2sam.pl and /dev/null differ
diff --git a/sam/misc/._bowtie2sam.pl b/sam/misc/._bowtie2sam.pl
deleted file mode 100755
index 94286bb..0000000
Binary files a/sam/misc/._bowtie2sam.pl and /dev/null differ
diff --git a/sam/misc/._export2sam.pl b/sam/misc/._export2sam.pl
deleted file mode 100755
index 94286bb..0000000
Binary files a/sam/misc/._export2sam.pl and /dev/null differ
diff --git a/sam/misc/._interpolate_sam.pl b/sam/misc/._interpolate_sam.pl
deleted file mode 100755
index 94286bb..0000000
Binary files a/sam/misc/._interpolate_sam.pl and /dev/null differ
diff --git a/sam/misc/._maq2sam.c b/sam/misc/._maq2sam.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/misc/._maq2sam.c and /dev/null differ
diff --git a/sam/misc/._md5.c b/sam/misc/._md5.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/misc/._md5.c and /dev/null differ
diff --git a/sam/misc/._md5.h b/sam/misc/._md5.h
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/misc/._md5.h and /dev/null differ
diff --git a/sam/misc/._md5fa.c b/sam/misc/._md5fa.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/misc/._md5fa.c and /dev/null differ
diff --git a/sam/misc/._novo2sam.pl b/sam/misc/._novo2sam.pl
deleted file mode 100755
index 94286bb..0000000
Binary files a/sam/misc/._novo2sam.pl and /dev/null differ
diff --git a/sam/misc/._plot-bamcheck b/sam/misc/._plot-bamcheck
deleted file mode 100755
index 94286bb..0000000
Binary files a/sam/misc/._plot-bamcheck and /dev/null differ
diff --git a/sam/misc/._psl2sam.pl b/sam/misc/._psl2sam.pl
deleted file mode 100755
index 94286bb..0000000
Binary files a/sam/misc/._psl2sam.pl and /dev/null differ
diff --git a/sam/misc/._r2plot.lua b/sam/misc/._r2plot.lua
deleted file mode 100755
index 94286bb..0000000
Binary files a/sam/misc/._r2plot.lua and /dev/null differ
diff --git a/sam/misc/._sam2vcf.pl b/sam/misc/._sam2vcf.pl
deleted file mode 100755
index 94286bb..0000000
Binary files a/sam/misc/._sam2vcf.pl and /dev/null differ
diff --git a/sam/misc/._samtools.pl b/sam/misc/._samtools.pl
deleted file mode 100755
index 94286bb..0000000
Binary files a/sam/misc/._samtools.pl and /dev/null differ
diff --git a/sam/misc/._soap2sam.pl b/sam/misc/._soap2sam.pl
deleted file mode 100755
index 94286bb..0000000
Binary files a/sam/misc/._soap2sam.pl and /dev/null differ
diff --git a/sam/misc/._varfilter.py b/sam/misc/._varfilter.py
deleted file mode 100755
index 94286bb..0000000
Binary files a/sam/misc/._varfilter.py and /dev/null differ
diff --git a/sam/misc/._vcfutils.lua b/sam/misc/._vcfutils.lua
deleted file mode 100755
index 94286bb..0000000
Binary files a/sam/misc/._vcfutils.lua and /dev/null differ
diff --git a/sam/misc/._wgsim.c b/sam/misc/._wgsim.c
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/misc/._wgsim.c and /dev/null differ
diff --git a/sam/misc/._wgsim_eval.pl b/sam/misc/._wgsim_eval.pl
deleted file mode 100755
index 94286bb..0000000
Binary files a/sam/misc/._wgsim_eval.pl and /dev/null differ
diff --git a/sam/misc/._zoom2sam.pl b/sam/misc/._zoom2sam.pl
deleted file mode 100755
index 94286bb..0000000
Binary files a/sam/misc/._zoom2sam.pl and /dev/null differ
diff --git a/sam/misc/HmmGlocal.java b/sam/misc/HmmGlocal.java
deleted file mode 100644
index 9e93b13..0000000
--- a/sam/misc/HmmGlocal.java
+++ /dev/null
@@ -1,178 +0,0 @@
-import java.io.*;
-import java.lang.*;
-
-public class HmmGlocal
-{
- private double[] qual2prob;
- private double cd, ce; // gap open probility [1e-3], gap extension probability [0.1]
- private int cb; // band width [7]
-
- public HmmGlocal(final double d, final double e, final int b) {
- cd = d; ce = e; cb = b;
- qual2prob = new double[256];
- for (int i = 0; i < 256; ++i)
- qual2prob[i] = Math.pow(10, -i/10.);
- }
- private static int set_u(final int b, final int i, final int k) {
- int x = i - b;
- x = x > 0? x : 0;
- return (k + 1 - x) * 3;
- }
- public int hmm_glocal(final byte[] _ref, final byte[] _query, final byte[] _iqual, int[] state, byte[] q) {
- int i, k;
- /*** initialization ***/
- // change coordinates
- int l_ref = _ref.length;
- byte[] ref = new byte[l_ref+1];
- for (i = 0; i < l_ref; ++i) ref[i+1] = _ref[i]; // FIXME: this is silly...
- int l_query = _query.length;
- byte[] query = new byte[l_query+1];
- double[] qual = new double[l_query+1];
- for (i = 0; i < l_query; ++i) {
- query[i+1] = _query[i];
- qual[i+1] = qual2prob[_iqual[i]];
- }
- // set band width
- int bw2, bw = l_ref > l_query? l_ref : l_query;
- if (bw > cb) bw = cb;
- if (bw < Math.abs(l_ref - l_query)) bw = Math.abs(l_ref - l_query);
- bw2 = bw * 2 + 1;
- // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
- double[][] f = new double[l_query+1][bw2*3 + 6];
- double[][] b = new double[l_query+1][bw2*3 + 6];
- double[] s = new double[l_query+2];
- // initialize transition probabilities
- double sM, sI, bM, bI;
- sM = sI = 1. / (2 * l_query + 2);
- bM = (1 - cd) / l_query; bI = cd / l_query; // (bM+bI)*l_query==1
- double[] m = new double[9];
- m[0*3+0] = (1 - cd - cd) * (1 - sM); m[0*3+1] = m[0*3+2] = cd * (1 - sM);
- m[1*3+0] = (1 - ce) * (1 - sI); m[1*3+1] = ce * (1 - sI); m[1*3+2] = 0.;
- m[2*3+0] = 1 - ce; m[2*3+1] = 0.; m[2*3+2] = ce;
- /*** forward ***/
- // f[0]
- f[0][set_u(bw, 0, 0)] = s[0] = 1.;
- { // f[1]
- double[] fi = f[1];
- double sum;
- int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end;
- for (k = beg, sum = 0.; k <= end; ++k) {
- int u;
- double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] / 3.;
- u = set_u(bw, 1, k);
- fi[u+0] = e * bM; fi[u+1] = .25 * bI;
- sum += fi[u] + fi[u+1];
- }
- // rescale
- s[1] = sum;
- _beg = set_u(bw, 1, beg); _end = set_u(bw, 1, end); _end += 2;
- for (k = _beg; k <= _end; ++k) fi[k] /= sum;
- }
- // f[2..l_query]
- for (i = 2; i <= l_query; ++i) {
- double[] fi = f[i], fi1 = f[i-1];
- double sum, qli = qual[i];
- int beg = 1, end = l_ref, x, _beg, _end;
- byte qyi = query[i];
- x = i - bw; beg = beg > x? beg : x; // band start
- x = i + bw; end = end < x? end : x; // band end
- for (k = beg, sum = 0.; k <= end; ++k) {
- int u, v11, v01, v10;
- double e;
- e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli / 3.;
- u = set_u(bw, i, k); v11 = set_u(bw, i-1, k-1); v10 = set_u(bw, i-1, k); v01 = set_u(bw, i, k-1);
- fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]);
- fi[u+1] = .25 * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
- fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
- sum += fi[u] + fi[u+1] + fi[u+2];
- //System.out.println("("+i+","+k+";"+u+"): "+fi[u]+","+fi[u+1]+","+fi[u+2]);
- }
- // rescale
- s[i] = sum;
- _beg = set_u(bw, i, beg); _end = set_u(bw, i, end); _end += 2;
- for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum;
- }
- { // f[l_query+1]
- double sum;
- for (k = 1, sum = 0.; k <= l_ref; ++k) {
- int u = set_u(bw, l_query, k);
- if (u < 3 || u >= bw2*3+3) continue;
- sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI;
- }
- s[l_query+1] = sum; // the last scaling factor
- }
- /*** backward ***/
- // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
- for (k = 1; k <= l_ref; ++k) {
- int u = set_u(bw, l_query, k);
- double[] bi = b[l_query];
- if (u < 3 || u >= bw2*3+3) continue;
- bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1];
- }
- // b[l_query-1..1]
- for (i = l_query - 1; i >= 1; --i) {
- int beg = 1, end = l_ref, x, _beg, _end;
- double[] bi = b[i], bi1 = b[i+1];
- double y = (i > 1)? 1. : 0., qli1 = qual[i+1];
- byte qyi1 = query[i+1];
- x = i - bw; beg = beg > x? beg : x;
- x = i + bw; end = end < x? end : x;
- for (k = end; k >= beg; --k) {
- int u, v11, v01, v10;
- double e;
- u = set_u(bw, i, k); v11 = set_u(bw, i+1, k+1); v10 = set_u(bw, i+1, k); v01 = set_u(bw, i, k+1);
- e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 / 3.) * bi1[v11];
- bi[u+0] = e * m[0] + .25 * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e.
- bi[u+1] = e * m[3] + .25 * m[4] * bi1[v10+1];
- bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y;
- }
- // rescale
- _beg = set_u(bw, i, beg); _end = set_u(bw, i, end); _end += 2;
- for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y;
- }
- double pb;
- { // b[0]
- int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
- double sum = 0.;
- for (k = end; k >= beg; --k) {
- int u = set_u(bw, 1, k);
- double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] / 3.;
- if (u < 3 || u >= bw2*3+3) continue;
- sum += e * b[1][u+0] * bM + .25 * b[1][u+1] * bI;
- }
- pb = b[0][set_u(bw, 0, 0)] = sum / s[0]; // if everything works as is expected, pb == 1.0
- }
- int is_diff = Math.abs(pb - 1.) > 1e-7? 1 : 0;
- /*** MAP ***/
- for (i = 1; i <= l_query; ++i) {
- double sum = 0., max = 0.;
- double[] fi = f[i], bi = b[i];
- int beg = 1, end = l_ref, x, max_k = -1;
- x = i - bw; beg = beg > x? beg : x;
- x = i + bw; end = end < x? end : x;
- for (k = beg; k <= end; ++k) {
- int u = set_u(bw, i, k);
- double z;
- sum += (z = fi[u+0] * bi[u+0]); if (z > max) { max = z; max_k = (k-1)<<2 | 0; }
- sum += (z = fi[u+1] * bi[u+1]); if (z > max) { max = z; max_k = (k-1)<<2 | 1; }
- }
- max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
- if (state != null) state[i-1] = max_k;
- if (q != null) {
- k = (int)(-4.343 * Math.log(1. - max) + .499);
- q[i-1] = (byte)(k > 100? 99 : k);
- }
- //System.out.println("("+pb+","+sum+")"+" ("+(i-1)+","+(max_k>>2)+","+(max_k&3)+","+max+")");
- }
- return 0;
- }
-
- public static void main(String[] args) {
- byte[] ref = {'\0', '\1', '\3', '\3', '\1'};
- byte[] query = {'\0', '\3', '\3', '\1'};
- byte[] qual = new byte[4];
- qual[0] = qual[1] = qual[2] = qual[3] = (byte)20;
- HmmGlocal hg = new HmmGlocal(1e-3, 0.1, 7);
- hg.hmm_glocal(ref, query, qual, null, null);
- }
-}
\ No newline at end of file
diff --git a/sam/misc/Makefile b/sam/misc/Makefile
deleted file mode 100644
index d36e7ac..0000000
--- a/sam/misc/Makefile
+++ /dev/null
@@ -1,69 +0,0 @@
-CC= gcc
-CXX= g++
-CFLAGS= -g -Wall -O2 #-m64 #-arch ppc
-CXXFLAGS= $(CFLAGS)
-DFLAGS= -D_FILE_OFFSET_BITS=64
-OBJS=
-PROG= md5sum-lite md5fa maq2sam-short maq2sam-long ace2sam wgsim bamcheck
-INCLUDES= -I..
-SUBDIRS= .
-
-.SUFFIXES:.c .o
-
-.c.o:
- $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
-
-all:$(PROG)
-
-lib-recur all-recur clean-recur cleanlocal-recur install-recur:
- @target=`echo $@ | sed s/-recur//`; \
- wdir=`pwd`; \
- list='$(SUBDIRS)'; for subdir in $$list; do \
- cd $$subdir; \
- $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
- INCLUDES="$(INCLUDES)" $$target || exit 1; \
- cd $$wdir; \
- done;
-
-lib:
-
-bamcheck:bamcheck.o
- $(CC) $(CFLAGS) -o $@ bamcheck.o -L.. -lm -lbam -lpthread -lz
-
-bamcheck.o:bamcheck.c ../faidx.h ../khash.h ../sam.h ../razf.h
- $(CC) $(CFLAGS) -c -I.. -o $@ bamcheck.c
-
-ace2sam:ace2sam.o
- $(CC) $(CFLAGS) -o $@ ace2sam.o -lz
-
-wgsim:wgsim.o
- $(CC) $(CFLAGS) -o $@ wgsim.o -lm -lz
-
-md5fa:md5.o md5fa.o md5.h ../kseq.h
- $(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz
-
-md5sum-lite:md5sum-lite.o
- $(CC) $(CFLAGS) -o $@ md5sum-lite.o
-
-md5sum-lite.o:md5.c md5.h
- $(CC) -c $(CFLAGS) -DMD5SUM_MAIN -o $@ md5.c
-
-maq2sam-short:maq2sam.c
- $(CC) $(CFLAGS) -o $@ maq2sam.c -lz
-
-maq2sam-long:maq2sam.c
- $(CC) $(CFLAGS) -DMAQ_LONGREADS -o $@ maq2sam.c -lz
-
-md5fa.o:md5.h md5fa.c
- $(CC) $(CFLAGS) -c -I.. -o $@ md5fa.c
-
-wgsim.o:wgsim.c ../kseq.h
- $(CC) $(CFLAGS) -c -I.. -o $@ wgsim.c
-
-ace2sam.o:ace2sam.c ../kstring.h ../kseq.h
- $(CC) $(CFLAGS) -c -I.. -o $@ ace2sam.c
-
-cleanlocal:
- rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a
-
-clean:cleanlocal-recur
diff --git a/sam/misc/ace2sam.c b/sam/misc/ace2sam.c
deleted file mode 100644
index 325133d..0000000
--- a/sam/misc/ace2sam.c
+++ /dev/null
@@ -1,249 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2011 Heng Li <***@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <zlib.h>
-#include "kstring.h"
-#include "kseq.h"
-KSTREAM_INIT(gzFile, gzread, 16384)
-
-#define N_TMPSTR 5
-#define LINE_LEN 60
-
-// append a CIGAR operation plus length
-#define write_cigar(_c, _n, _m, _v) do { \
- if (_n == _m) { \
- _m = _m? _m<<1 : 4; \
- _c = realloc(_c, _m * sizeof(unsigned)); \
- } \
- _c[_n++] = (_v); \
- } while (0)
-
-// a fatal error
-static void fatal(const char *msg)
-{
- fprintf(stderr, "E %s\n", msg);
- exit(1);
-}
-// remove pads
-static void remove_pads(const kstring_t *src, kstring_t *dst)
-{
- int i, j;
- dst->l = 0;
- kputsn(src->s, src->l, dst);
- for (i = j = 0; i < dst->l; ++i)
- if (dst->s[i] != '*') dst->s[j++] = dst->s[i];
- dst->s[j] = 0;
- dst->l = j;
-}
-
-int main(int argc, char *argv[])
-{
- gzFile fp;
- kstream_t *ks;
- kstring_t s, t[N_TMPSTR];
- int dret, i, k, af_n, af_max, af_i, c, is_padded = 0, write_cns = 0, *p2u = 0;
- long m_cigar = 0, n_cigar = 0;
- unsigned *af, *cigar = 0;
-
- while ((c = getopt(argc, argv, "pc")) >= 0) {
- switch (c) {
- case 'p': is_padded = 1; break;
- case 'c': write_cns = 1; break;
- }
- }
- if (argc == optind) {
- fprintf(stderr, "\nUsage: ace2sam [-pc] <in.ace>\n\n");
- fprintf(stderr, "Options: -p output padded SAM\n");
- fprintf(stderr, " -c write the contig sequence in SAM\n\n");
- fprintf(stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n");
- fprintf(stderr, " 2. The order of reads in AF and in RD must be identical\n");
- fprintf(stderr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n");
- fprintf(stderr, " 4. This program writes the headerless SAM to stdout and header to stderr\n\n");
- return 1;
- }
-
- s.l = s.m = 0; s.s = 0;
- af_n = af_max = af_i = 0; af = 0;
- for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0;
- fp = strcmp(argv[1], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
- ks = ks_init(fp);
- while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
- if (strcmp(s.s, "CO") == 0) { // contig sequence
- kstring_t *cns;
- t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded ctg; 2: unpadded ctg/padded read; 3: unpadded read; 4: SAM line
- af_n = af_i = 0; // reset the af array
- ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]); // contig name
- ks_getuntil(ks, '\n', &s, &dret); // read the whole line
- while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputsn(s.s, s.l, &t[1]); // read the padded consensus sequence
- remove_pads(&t[1], &t[2]); // construct the unpadded sequence
- // compute the array for mapping padded positions to unpadded positions
- p2u = realloc(p2u, t[1].m * sizeof(int));
- for (i = k = 0; i < t[1].l; ++i) {
- p2u[i] = k;
- if (t[1].s[i] != '*') ++k;
- }
- // write out the SAM header and contig sequences
- fprintf(stderr, "H @SQ\tSN:%s\tLN:%ld\n", t[0].s, t[is_padded?1:2].l); // The SAM header line
- cns = &t[is_padded?1:2];
- fprintf(stderr, "S >%s\n", t[0].s);
- for (i = 0; i < cns->l; i += LINE_LEN) {
- fputs("S ", stderr);
- for (k = 0; k < LINE_LEN && i + k < cns->l; ++k)
- fputc(cns->s[i + k], stderr);
- fputc('\n', stderr);
- }
-
-#define __padded2cigar(sp) do { \
- int i, l_M = 0, l_D = 0; \
- for (i = 0; i < sp.l; ++i) { \
- if (sp.s[i] == '*') { \
- if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
- ++l_D; l_M = 0; \
- } else { \
- if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
- ++l_M; l_D = 0; \
- } \
- } \
- if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
- else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
- } while (0)
-
- if (write_cns) { // write the consensus SAM line (dummy read)
- n_cigar = 0;
- if (is_padded) __padded2cigar(t[1]);
- else write_cigar(cigar, n_cigar, m_cigar, t[2].l<<4);
- kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", &t[4]);
- for (i = 0; i < n_cigar; ++i) {
- kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]);
- }
- kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]);
- }
- } else if (strcmp(s.s, "BQ") == 0) { // contig quality
- if (t[0].l == 0) fatal("come to 'BQ' before reading 'CO'");
- if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire "BQ" line
- if (write_cns) t[4].s[--t[4].l] = 0; // remove the trailing "*"
- for (i = 0; i < t[2].l; ++i) { // read the consensus quality
- int q;
- if (ks_getuntil(ks, 0, &s, &dret) < 0) fprintf(stderr, "E truncated contig quality\n");
- if (s.l) {
- q = atoi(s.s) + 33;
- if (q > 126) q = 126;
- if (write_cns) kputc(q, &t[4]);
- } else --i;
- }
- if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
- ks_getuntil(ks, '\n', &s, &dret); // skip the empty line
- if (write_cns) puts(t[4].s); t[4].l = 0;
- } else if (strcmp(s.s, "AF") == 0) { // padded read position
- int reversed, neg, pos;
- if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'");
- if (write_cns) {
- if (t[4].l) puts(t[4].s);
- t[4].l = 0;
- }
- ks_getuntil(ks, 0, &s, &dret); // read name
- ks_getuntil(ks, 0, &s, &dret); reversed = s.s[0] == 'C'? 1 : 0; // strand
- ks_getuntil(ks, 0, &s, &dret); pos = atoi(s.s); neg = pos < 0? 1 : 0; pos = pos < 0? -pos : pos; // position
- if (af_n == af_max) { // double the af array
- af_max = af_max? af_max<<1 : 4;
- af = realloc(af, af_max * sizeof(unsigned));
- }
- af[af_n++] = pos << 2 | neg << 1 | reversed; // keep the placement information
- } else if (strcmp(s.s, "RD") == 0) { // read sequence
- if (af_i >= af_n) fatal("more 'RD' records than 'AF'");
- t[2].l = t[3].l = t[4].l = 0;
- ks_getuntil(ks, 0, &t[4], &dret); // QNAME
- if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire RD line
- while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputs(s.s, &t[2]); // read the read sequence
- } else if (strcmp(s.s, "QA") == 0) { // clipping
- if (af_i >= af_n) fatal("more 'QA' records than 'AF'");
- int beg, end, pos, op;
- ks_getuntil(ks, 0, &s, &dret); ks_getuntil(ks, 0, &s, &dret); // skip quality clipping
- ks_getuntil(ks, 0, &s, &dret); beg = atoi(s.s) - 1; // align clipping start
- ks_getuntil(ks, 0, &s, &dret); end = atoi(s.s); // clipping end
- if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
- // compute 1-based POS
- pos = af[af_i]>>2; // retrieve the position information
- if (af[af_i]>>1&1) pos = -pos;
- pos += beg; // now pos is the true padded position
- // generate CIGAR
- remove_pads(&t[2], &t[3]); // backup the unpadded read sequence
- n_cigar = 0;
- if (beg) write_cigar(cigar, n_cigar, m_cigar, beg<<4|4);
- if (is_padded) {
- __padded2cigar(t[2]);
- if (beg && n_cigar > 1) cigar[1] -= beg<<4; // fix the left-hand CIGAR
- if (end < t[2].l && n_cigar) cigar[n_cigar-1] -= (t[2].l - end)<<4; // fix the right-hand CIGAR
- } else {
- // generate flattened CIGAR string
- for (i = beg, k = pos - 1; i < end; ++i, ++k)
- t[2].s[i] = t[2].s[i] != '*'? (t[1].s[k] != '*'? 0 : 1) : (t[1].s[k] != '*'? 2 : 6);
- // generate the proper CIGAR
- for (i = beg + 1, k = 1, op = t[2].s[beg]; i < end; ++i) {
- if (op != t[2].s[i]) {
- write_cigar(cigar, n_cigar, m_cigar, k<<4|op);
- op = t[2].s[i]; k = 1;
- } else ++k;
- }
- write_cigar(cigar, n_cigar, m_cigar, k<<4|op);
- // remove unnecessary "P" and possibly merge adjacent operations
- for (i = 2; i < n_cigar; ++i) {
- if ((cigar[i]&0xf) != 1 && (cigar[i-1]&0xf) == 6 && (cigar[i-2]&0xf) != 1) {
- cigar[i-1] = 0;
- if ((cigar[i]&0xf) == (cigar[i-2]&0xf)) // merge operations
- cigar[i] += cigar[i-2], cigar[i-2] = 0;
- }
- }
- for (i = k = 0; i < n_cigar; ++i) // squeeze out dumb operations
- if (cigar[i]) cigar[k++] = cigar[i];
- n_cigar = k;
- }
- if (end < t[2].l) write_cigar(cigar, n_cigar, m_cigar, (t[2].l - end)<<4|4);
- // write the SAM line for the read
- kputc('\t', &t[4]); // QNAME has already been written
- kputw((af[af_i]&1)? 16 : 0, &t[4]); kputc('\t', &t[4]); // FLAG
- kputsn(t[0].s, t[0].l, &t[4]); kputc('\t', &t[4]); // RNAME
- kputw(is_padded? pos : p2u[pos-1]+1, &t[4]); // POS
- kputs("\t60\t", &t[4]); // MAPQ
- for (i = 0; i < n_cigar; ++i) { // CIGAR
- kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]);
- }
- kputs("\t*\t0\t0\t", &t[4]); // empty MRNM, MPOS and TLEN
- kputsn(t[3].s, t[3].l, &t[4]); // unpadded SEQ
- kputs("\t*", &t[4]); // QUAL
- puts(t[4].s); // print to stdout
- ++af_i;
- } else if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
- }
- ks_destroy(ks);
- gzclose(fp);
- free(af); free(s.s); free(cigar); free(p2u);
- for (i = 0; i < N_TMPSTR; ++i) free(t[i].s);
- return 0;
-}
diff --git a/sam/misc/bamcheck.c b/sam/misc/bamcheck.c
deleted file mode 100644
index 352db21..0000000
--- a/sam/misc/bamcheck.c
+++ /dev/null
@@ -1,1521 +0,0 @@
-/*
- Author: ***@sanger
- gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread
-
- Assumptions, approximations and other issues:
- - GC-depth graph does not split reads, the starting position determines which bin is incremented.
- There are small overlaps between bins (max readlen-1). However, the bins are big (20k).
- - coverage distribution ignores softclips and deletions
- - some stats require sorted BAMs
- - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths.
- - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq)
- - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar)
- counts, no splicing is done, no indels or soft clips are considered, even small overlap is
- good enough to include the read in the stats.
-
-*/
-
-#define BAMCHECK_VERSION "2012-09-04"
-
-#define _ISOC99_SOURCE
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <string.h>
-#include <math.h>
-#include <ctype.h>
-#include <getopt.h>
-#include <errno.h>
-#include <assert.h>
-#include "faidx.h"
-#include "khash.h"
-#include "sam.h"
-#include "sam_header.h"
-#include "razf.h"
-
-#define BWA_MIN_RDLEN 35
-#define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP))
-#define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP)
-#define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE)
-#define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE)
-#define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
-#define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
-#define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
-
-typedef struct
-{
- int32_t line_len, line_blen;
- int64_t len;
- uint64_t offset;
-}
-faidx1_t;
-KHASH_MAP_INIT_STR(kh_faidx, faidx1_t)
-KHASH_MAP_INIT_STR(kh_bam_tid, int)
-KHASH_MAP_INIT_STR(kh_rg, const char *)
-struct __faidx_t {
- RAZF *rz;
- int n, m;
- char **name;
- khash_t(kh_faidx) *hash;
-};
-
-typedef struct
-{
- float gc;
- uint32_t depth;
-}
-gc_depth_t;
-
-// For coverage distribution, a simple pileup
-typedef struct
-{
- int64_t pos;
- int size, start;
- int *buffer;
-}
-round_buffer_t;
-
-typedef struct { uint32_t from, to; } pos_t;
-typedef struct
-{
- int npos,mpos,cpos;
- pos_t *pos;
-}
-regions_t;
-
-typedef struct
-{
- // Parameters
- int trim_qual; // bwa trim quality
-
- // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
- // insert size histogram holder
- int nquals; // The number of quality bins
- int nbases; // The maximum sequence length the allocated array can hold
- int nisize; // The maximum insert size that the allocated array can hold
- int ngc; // The size of gc_1st and gc_2nd
- int nindels; // The maximum indel length for indel distribution
-
- // Arrays for the histogram data
- uint64_t *quals_1st, *quals_2nd;
- uint64_t *gc_1st, *gc_2nd;
- uint64_t *isize_inward, *isize_outward, *isize_other;
- uint64_t *acgt_cycles;
- uint64_t *read_lengths;
- uint64_t *insertions, *deletions;
- uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
-
- // The extremes encountered
- int max_len; // Maximum read length
- int max_qual; // Maximum quality
- float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
- int is_sorted;
-
- // Summary numbers
- uint64_t total_len;
- uint64_t total_len_dup;
- uint64_t nreads_1st;
- uint64_t nreads_2nd;
- uint64_t nreads_filtered;
- uint64_t nreads_dup;
- uint64_t nreads_unmapped;
- uint64_t nreads_unpaired;
- uint64_t nreads_paired;
- uint64_t nreads_anomalous;
- uint64_t nreads_mq0;
- uint64_t nbases_mapped;
- uint64_t nbases_mapped_cigar;
- uint64_t nbases_trimmed; // bwa trimmed bases
- uint64_t nmismatches;
- uint64_t nreads_QCfailed, nreads_secondary;
-
- // GC-depth related data
- uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
- gc_depth_t *gcd; // The GC-depth bins holder
- int gcd_bin_size; // The size of GC-depth bin
- uint32_t gcd_ref_size; // The approximate size of the genome
- int32_t tid, gcd_pos; // Position of the current bin
- int32_t pos; // Position of the last read
-
- // Coverage distribution related data
- int ncov; // The number of coverage bins
- uint64_t *cov; // The coverage frequencies
- int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
- round_buffer_t cov_rbuf; // Pileup round buffer
-
- // Mismatches by read cycle
- uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against
- int mrseq_buf; // The size of the buffer
- int32_t rseq_pos; // The coordinate of the first base in the buffer
- int32_t nrseq_buf; // The used part of the buffer
- uint64_t *mpc_buf; // Mismatches per cycle
-
- // Filters
- int filter_readlen;
-
- // Target regions
- int nregions, reg_from,reg_to;
- regions_t *regions;
-
- // Auxiliary data
- int flag_require, flag_filter;
- double sum_qual; // For calculating average quality value
- samfile_t *sam;
- khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated
- faidx_t *fai; // Reference sequence for GC-depth graph
- int argc; // Command line arguments to be printed on the output
- char **argv;
-}
-stats_t;
-
-void error(const char *format, ...);
-void bam_init_header_hash(bam_header_t *header);
-int is_in_regions(bam1_t *bam_line, stats_t *stats);
-
-
-// Coverage distribution methods
-inline int coverage_idx(int min, int max, int n, int step, int depth)
-{
- if ( depth < min )
- return 0;
-
- if ( depth > max )
- return n-1;
-
- return 1 + (depth - min) / step;
-}
-
-inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos)
-{
- return (offset + (pos-refpos) % size) % size;
-}
-
-void round_buffer_flush(stats_t *stats, int64_t pos)
-{
- int ibuf,idp;
-
- if ( pos==stats->cov_rbuf.pos )
- return;
-
- int64_t new_pos = pos;
- if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size )
- {
- // Flush the whole buffer, but in sequential order,
- pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1;
- }
-
- if ( pos < stats->cov_rbuf.pos )
- error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos);
-
- int ifrom = stats->cov_rbuf.start;
- int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1);
- if ( ifrom>ito )
- {
- for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++)
- {
- if ( !stats->cov_rbuf.buffer[ibuf] )
- continue;
- idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
- stats->cov[idp]++;
- stats->cov_rbuf.buffer[ibuf] = 0;
- }
- ifrom = 0;
- }
- for (ibuf=ifrom; ibuf<=ito; ibuf++)
- {
- if ( !stats->cov_rbuf.buffer[ibuf] )
- continue;
- idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
- stats->cov[idp]++;
- stats->cov_rbuf.buffer[ibuf] = 0;
- }
- stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos);
- stats->cov_rbuf.pos = new_pos;
-}
-
-void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to)
-{
- if ( to-from >= rbuf->size )
- error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size);
- if ( from < rbuf->pos )
- error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos);
-
- int ifrom,ito,ibuf;
- ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from);
- ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to);
- if ( ifrom>ito )
- {
- for (ibuf=ifrom; ibuf<rbuf->size; ibuf++)
- rbuf->buffer[ibuf]++;
- ifrom = 0;
- }
- for (ibuf=ifrom; ibuf<=ito; ibuf++)
- rbuf->buffer[ibuf]++;
-}
-
-// Calculate the number of bases in the read trimmed by BWA
-int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse)
-{
- if ( len<BWA_MIN_RDLEN ) return 0;
-
- // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN,
- // the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c).
- int max_trimmed = len - BWA_MIN_RDLEN + 1;
- int l, sum=0, max_sum=0, max_l=0;
-
- for (l=0; l<max_trimmed; l++)
- {
- sum += trim_qual - quals[ reverse ? l : len-1-l ];
- if ( sum<0 ) break;
- if ( sum>max_sum )
- {
- max_sum = sum;
- // This is the correct way, but bwa clips from some reason one base less
- // max_l = l+1;
- max_l = l;
- }
- }
- return max_l;
-}
-
-
-void count_indels(stats_t *stats,bam1_t *bam_line)
-{
- int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
- int is_1st = IS_READ1(bam_line) ? 1 : 0;
- int icig;
- int icycle = 0;
- int read_len = bam_line->core.l_qseq;
- for (icig=0; icig<bam_line->core.n_cigar; icig++)
- {
- // Conversion from uint32_t to MIDNSHP
- // 0123456
- // MIDNSHP
- int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
- int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
-
- if ( cig==1 )
- {
- int idx = is_fwd ? icycle : read_len-icycle-ncig;
- if ( idx<0 )
- error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
- if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
- if ( is_1st )
- stats->ins_cycles_1st[idx]++;
- else
- stats->ins_cycles_2nd[idx]++;
- icycle += ncig;
- if ( ncig<=stats->nindels )
- stats->insertions[ncig-1]++;
- continue;
- }
- if ( cig==2 )
- {
- int idx = is_fwd ? icycle-1 : read_len-icycle-1;
- if ( idx<0 ) continue; // discard meaningless deletions
- if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
- if ( is_1st )
- stats->del_cycles_1st[idx]++;
- else
- stats->del_cycles_2nd[idx]++;
- if ( ncig<=stats->nindels )
- stats->deletions[ncig-1]++;
- continue;
- }
- if ( cig!=3 && cig!=5 )
- icycle += ncig;
- }
-}
-
-void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
-{
- int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
- int icig,iread=0,icycle=0;
- int iref = bam_line->core.pos - stats->rseq_pos;
- int read_len = bam_line->core.l_qseq;
- uint8_t *read = bam1_seq(bam_line);
- uint8_t *quals = bam1_qual(bam_line);
- uint64_t *mpc_buf = stats->mpc_buf;
- for (icig=0; icig<bam_line->core.n_cigar; icig++)
- {
- // Conversion from uint32_t to MIDNSHP
- // 0123456
- // MIDNSHP
- int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
- int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
- if ( cig==1 )
- {
- iread += ncig;
- icycle += ncig;
- continue;
- }
- if ( cig==2 )
- {
- iref += ncig;
- continue;
- }
- if ( cig==4 )
- {
- icycle += ncig;
- // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence
- // iref += ncig;
- iread += ncig;
- continue;
- }
- if ( cig==5 )
- {
- icycle += ncig;
- continue;
- }
- // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large
- // chunk of refseq in memory. Not very frequent and not noticable in the stats.
- if ( cig==3 || cig==5 ) continue;
- if ( cig!=0 )
- error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
-
- if ( ncig+iref > stats->nrseq_buf )
- error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
-
- int im;
- for (im=0; im<ncig; im++)
- {
- uint8_t cread = bam1_seqi(read,iread);
- uint8_t cref = stats->rseq_buf[iref];
-
- // ---------------15
- // =ACMGRSVTWYHKDBN
- if ( cread==15 )
- {
- int idx = is_fwd ? icycle : read_len-icycle-1;
- if ( idx>stats->max_len )
- error("mpc: %d>%d\n",idx,stats->max_len);
- idx = idx*stats->nquals;
- if ( idx>=stats->nquals*stats->nbases )
- error("FIXME: mpc_buf overflow\n");
- mpc_buf[idx]++;
- }
- else if ( cref && cread && cref!=cread )
- {
- uint8_t qual = quals[iread] + 1;
- if ( qual>=stats->nquals )
- error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
-
- int idx = is_fwd ? icycle : read_len-icycle-1;
- if ( idx>stats->max_len )
- error("mpc: %d>%d\n",idx,stats->max_len);
-
- idx = idx*stats->nquals + qual;
- if ( idx>=stats->nquals*stats->nbases )
- error("FIXME: mpc_buf overflow\n");
- mpc_buf[idx]++;
- }
-
- iref++;
- iread++;
- icycle++;
- }
- }
-}
-
-void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
-{
- khash_t(kh_faidx) *h;
- khiter_t iter;
- faidx1_t val;
- char *chr, c;
- faidx_t *fai = stats->fai;
-
- h = fai->hash;
- chr = stats->sam->header->target_name[tid];
-
- // ID of the sequence name
- iter = kh_get(kh_faidx, h, chr);
- if (iter == kh_end(h))
- error("No such reference sequence [%s]?\n", chr);
- val = kh_value(h, iter);
-
- // Check the boundaries
- if (pos >= val.len)
- error("Was the bam file mapped with the reference sequence supplied?"
- " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len);
- int size = stats->mrseq_buf;
- // The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
- if (size+pos > val.len) size = val.len-pos;
-
- // Position the razf reader
- razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET);
-
- uint8_t *ptr = stats->rseq_buf;
- int nread = 0;
- while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err )
- {
- if ( !isgraph(c) )
- continue;
-
- // Conversion between uint8_t coding and ACGT
- // -12-4---8-------
- // =ACMGRSVTWYHKDBN
- if ( c=='A' || c=='a' )
- *ptr = 1;
- else if ( c=='C' || c=='c' )
- *ptr = 2;
- else if ( c=='G' || c=='g' )
- *ptr = 4;
- else if ( c=='T' || c=='t' )
- *ptr = 8;
- else
- *ptr = 0;
- ptr++;
- nread++;
- }
- if ( nread < stats->mrseq_buf )
- {
- memset(ptr,0, stats->mrseq_buf - nread);
- nread = stats->mrseq_buf;
- }
- stats->nrseq_buf = nread;
- stats->rseq_pos = pos;
- stats->tid = tid;
-}
-
-float fai_gc_content(stats_t *stats, int pos, int len)
-{
- uint32_t gc,count,c;
- int i = pos - stats->rseq_pos, ito = i + len;
- assert( i>=0 && ito<=stats->nrseq_buf );
-
- // Count GC content
- gc = count = 0;
- for (; i<ito; i++)
- {
- c = stats->rseq_buf[i];
- if ( c==2 || c==4 )
- {
- gc++;
- count++;
- }
- else if ( c==1 || c==8 )
- count++;
- }
- return count ? (float)gc/count : 0;
-}
-
-void realloc_rseq_buffer(stats_t *stats)
-{
- int n = stats->nbases*10;
- if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size;
- if ( stats->mrseq_buf<n )
- {
- stats->rseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n);
- stats->mrseq_buf = n;
- }
-}
-
-void realloc_gcd_buffer(stats_t *stats, int seq_len)
-{
- if ( seq_len >= stats->gcd_bin_size )
- error("The --GC-depth bin size (%d) is set too low for the read length %d\n", stats->gcd_bin_size, seq_len);
-
- int n = 1 + stats->gcd_ref_size / (stats->gcd_bin_size - seq_len);
- if ( n <= stats->igcd )
- error("The --GC-depth bin size is too small or reference genome too big; please decrease the bin size or increase the reference length\n");
-
- if ( n > stats->ngcd )
- {
- stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
- if ( !stats->gcd )
- error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
- memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
- stats->ngcd = n;
- }
-
- realloc_rseq_buffer(stats);
-}
-
-void realloc_buffers(stats_t *stats, int seq_len)
-{
- int n = 2*(1 + seq_len - stats->nbases) + stats->nbases;
-
- stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t));
- if ( !stats->quals_1st )
- error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
- memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
-
- stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t));
- if ( !stats->quals_2nd )
- error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
- memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
-
- if ( stats->mpc_buf )
- {
- stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t));
- if ( !stats->mpc_buf )
- error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
- memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
- }
-
- stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t));
- if ( !stats->acgt_cycles )
- error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t));
- memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t));
-
- stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
- if ( !stats->read_lengths )
- error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
- memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
-
- stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t));
- if ( !stats->insertions )
- error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
- memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
-
- stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t));
- if ( !stats->deletions )
- error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
- memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
-
- stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t));
- if ( !stats->ins_cycles_1st )
- error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
- memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
-
- stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t));
- if ( !stats->ins_cycles_2nd )
- error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
- memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
-
- stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t));
- if ( !stats->del_cycles_1st )
- error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
- memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
-
- stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t));
- if ( !stats->del_cycles_2nd )
- error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
- memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
-
- stats->nbases = n;
-
- // Realloc the coverage distribution buffer
- int *rbuffer = calloc(sizeof(int),seq_len*5);
- n = stats->cov_rbuf.size-stats->cov_rbuf.start;
- memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n);
- if ( stats->cov_rbuf.start>1 )
- memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start);
- stats->cov_rbuf.start = 0;
- free(stats->cov_rbuf.buffer);
- stats->cov_rbuf.buffer = rbuffer;
- stats->cov_rbuf.size = seq_len*5;
-
- realloc_rseq_buffer(stats);
-}
-
-void collect_stats(bam1_t *bam_line, stats_t *stats)
-{
- if ( stats->rg_hash )
- {
- const uint8_t *rg = bam_aux_get(bam_line, "RG");
- if ( !rg ) return;
- khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1));
- if ( k == kh_end(stats->rg_hash) ) return;
- }
- if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
- {
- stats->nreads_filtered++;
- return;
- }
- if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
- {
- stats->nreads_filtered++;
- return;
- }
- if ( !is_in_regions(bam_line,stats) )
- return;
- if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen )
- return;
-
- if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++;
- if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++;
-
- int seq_len = bam_line->core.l_qseq;
- if ( !seq_len ) return;
-
- if ( seq_len >= stats->nbases )
- realloc_buffers(stats,seq_len);
- if ( stats->max_len<seq_len )
- stats->max_len = seq_len;
-
- stats->read_lengths[seq_len]++;
-
- // Count GC and ACGT per cycle
- uint8_t base, *seq = bam1_seq(bam_line);
- int gc_count = 0;
- int i;
- int reverse = IS_REVERSE(bam_line);
- for (i=0; i<seq_len; i++)
- {
- // Conversion from uint8_t coding to ACGT
- // -12-4---8-------
- // =ACMGRSVTWYHKDBN
- // 01 2 3
- base = bam1_seqi(seq,i);
- base /= 2;
- if ( base==1 || base==2 ) gc_count++;
- else if ( base>2 ) base=3;
- if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 )
- error("FIXME: acgt_cycles\n");
- stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
- }
- int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
- int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
- if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1;
-
- // Determine which array (1st or 2nd read) will these stats go to,
- // trim low quality bases from end the same way BWA does,
- // fill GC histogram
- uint64_t *quals;
- uint8_t *bam_quals = bam1_qual(bam_line);
- if ( bam_line->core.flag&BAM_FREAD2 )
- {
- quals = stats->quals_2nd;
- stats->nreads_2nd++;
- for (i=gc_idx_min; i<gc_idx_max; i++)
- stats->gc_2nd[i]++;
- }
- else
- {
- quals = stats->quals_1st;
- stats->nreads_1st++;
- for (i=gc_idx_min; i<gc_idx_max; i++)
- stats->gc_1st[i]++;
- }
- if ( stats->trim_qual>0 )
- stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
-
- // Quality histogram and average quality
- for (i=0; i<seq_len; i++)
- {
- uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
- if ( qual>=stats->nquals )
- error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
- if ( qual>stats->max_qual )
- stats->max_qual = qual;
-
- quals[ i*stats->nquals+qual ]++;
- stats->sum_qual += qual;
- }
-
- // Look at the flags and increment appropriate counters (mapped, paired, etc)
- if ( IS_UNMAPPED(bam_line) )
- stats->nreads_unmapped++;
- else
- {
- if ( !bam_line->core.qual )
- stats->nreads_mq0++;
-
- count_indels(stats,bam_line);
-
- if ( !IS_PAIRED(bam_line) )
- stats->nreads_unpaired++;
- else
- {
- stats->nreads_paired++;
-
- if ( bam_line->core.tid!=bam_line->core.mtid )
- stats->nreads_anomalous++;
-
- // The insert size is tricky, because for long inserts the libraries are
- // prepared differently and the pairs point in other direction. BWA does
- // not set the paired flag for them. Similar thing is true also for 454
- // reads. Mates mapped to different chromosomes have isize==0.
- int32_t isize = bam_line->core.isize;
- if ( isize<0 ) isize = -isize;
- if ( isize >= stats->nisize )
- isize = stats->nisize-1;
- if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
- {
- int pos_fst = bam_line->core.mpos - bam_line->core.pos;
- int is_fst = IS_READ1(bam_line) ? 1 : -1;
- int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
- int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
-
- if ( is_fwd*is_mfwd>0 )
- stats->isize_other[isize]++;
- else if ( is_fst*pos_fst>0 )
- {
- if ( is_fst*is_fwd>0 )
- stats->isize_inward[isize]++;
- else
- stats->isize_outward[isize]++;
- }
- else if ( is_fst*pos_fst<0 )
- {
- if ( is_fst*is_fwd>0 )
- stats->isize_outward[isize]++;
- else
- stats->isize_inward[isize]++;
- }
- }
- }
-
- // Number of mismatches
- uint8_t *nm = bam_aux_get(bam_line,"NM");
- if (nm)
- stats->nmismatches += bam_aux2i(nm);
-
- // Number of mapped bases from cigar
- // Conversion from uint32_t to MIDNSHP
- // 012-4--
- // MIDNSHP
- if ( bam_line->core.n_cigar == 0)
- error("FIXME: mapped read with no cigar?\n");
- int readlen=seq_len;
- if ( stats->regions )
- {
- // Count only on-target bases
- int iref = bam_line->core.pos + 1;
- for (i=0; i<bam_line->core.n_cigar; i++)
- {
- int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK;
- int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
- if ( cig==2 ) readlen += ncig;
- else if ( cig==0 )
- {
- if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
- else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
- if ( ncig<0 ) ncig = 0;
- stats->nbases_mapped_cigar += ncig;
- iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
- }
- else if ( cig==1 )
- {
- iref += ncig;
- if ( iref>=stats->reg_from && iref<=stats->reg_to )
- stats->nbases_mapped_cigar += ncig;
- }
- }
- }
- else
- {
- // Count the whole read
- for (i=0; i<bam_line->core.n_cigar; i++)
- {
- if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
- stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
- if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
- readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
- }
- }
- stats->nbases_mapped += seq_len;
-
- if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
- stats->is_sorted = 0;
- stats->pos = bam_line->core.pos;
-
- if ( stats->is_sorted )
- {
- if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
- round_buffer_flush(stats,-1);
-
- // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins
- // are not splitted which results in up to seq_len-1 overlaps. The default bin size is
- // 20kbp, so the effect is negligible.
- if ( stats->fai )
- {
- int inc_ref = 0, inc_gcd = 0;
- // First pass or new chromosome
- if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
- // Read goes beyond the end of the rseq buffer
- else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
- // Read overlaps the next gcd bin
- else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen )
- {
- inc_gcd = 1;
- if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1;
- }
- if ( inc_gcd )
- {
- stats->igcd++;
- if ( stats->igcd >= stats->ngcd )
- realloc_gcd_buffer(stats, readlen);
- if ( inc_ref )
- read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
- stats->gcd_pos = bam_line->core.pos;
- stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size);
- }
-
- count_mismatches_per_cycle(stats,bam_line);
- }
- // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
- else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
- {
- // First pass or a new chromosome
- stats->tid = bam_line->core.tid;
- stats->gcd_pos = bam_line->core.pos;
- stats->igcd++;
- if ( stats->igcd >= stats->ngcd )
- realloc_gcd_buffer(stats, readlen);
- }
- stats->gcd[ stats->igcd ].depth++;
- // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
- if ( !stats->fai )
- stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
-
- // Coverage distribution graph
- round_buffer_flush(stats,bam_line->core.pos);
- round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
- }
- }
-
- stats->total_len += seq_len;
- if ( IS_DUP(bam_line) )
- {
- stats->total_len_dup += seq_len;
- stats->nreads_dup++;
- }
-}
-
-// Sort by GC and depth
-#define GCD_t(x) ((gc_depth_t *)x)
-static int gcd_cmp(const void *a, const void *b)
-{
- if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1;
- if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1;
- if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1;
- if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1;
- return 0;
-}
-#undef GCD_t
-
-float gcd_percentile(gc_depth_t *gcd, int N, int p)
-{
- float n,d;
- int k;
-
- n = p*(N+1)/100;
- k = n;
- if ( k<=0 )
- return gcd[0].depth;
- if ( k>=N )
- return gcd[N-1].depth;
-
- d = n - k;
- return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth);
-}
-
-void output_stats(stats_t *stats)
-{
- // Calculate average insert size and standard deviation (from the main bulk data only)
- int isize, ibulk=0;
- uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
- for (isize=0; isize<stats->nisize; isize++)
- {
- // Each pair was counted twice
- stats->isize_inward[isize] *= 0.5;
- stats->isize_outward[isize] *= 0.5;
- stats->isize_other[isize] *= 0.5;
-
- nisize_inward += stats->isize_inward[isize];
- nisize_outward += stats->isize_outward[isize];
- nisize_other += stats->isize_other[isize];
- nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
- }
-
- double bulk=0, avg_isize=0, sd_isize=0;
- for (isize=0; isize<stats->nisize; isize++)
- {
- bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
- avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
-
- if ( bulk/nisize > stats->isize_main_bulk )
- {
- ibulk = isize+1;
- nisize = bulk;
- break;
- }
- }
- avg_isize /= nisize ? nisize : 1;
- for (isize=1; isize<ibulk; isize++)
- sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize;
- sd_isize = sqrt(sd_isize);
-
-
- printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION);
- printf("# The command line was: %s",stats->argv[0]);
- int i;
- for (i=1; i<stats->argc; i++)
- printf(" %s",stats->argv[i]);
- printf("\n");
- printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
- printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd));
- printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
- printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
- printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
- printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
- printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
- printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
- printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired));
- printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
- printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired);
- printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
- printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
- printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
- printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
- printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
- printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
- printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
- printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);
- printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
- printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
- printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches);
- printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar);
- float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
- printf("SN\taverage length:\t%.0f\n", avg_read_length);
- printf("SN\tmaximum length:\t%d\n", stats->max_len);
- printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
- printf("SN\tinsert size average:\t%.1f\n", avg_isize);
- printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
- printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
- printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
- printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
- printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
-
- int ibase,iqual;
- if ( stats->max_len<stats->nbases ) stats->max_len++;
- if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
- printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
- printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
- for (ibase=0; ibase<stats->max_len; ibase++)
- {
- printf("FFQ\t%d",ibase+1);
- for (iqual=0; iqual<=stats->max_qual; iqual++)
- {
- printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
- }
- printf("\n");
- }
- printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
- printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
- for (ibase=0; ibase<stats->max_len; ibase++)
- {
- printf("LFQ\t%d",ibase+1);
- for (iqual=0; iqual<=stats->max_qual; iqual++)
- {
- printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
- }
- printf("\n");
- }
- if ( stats->mpc_buf )
- {
- printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
- printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
- printf("# is the number of N's and the rest is the number of mismatches\n");
- for (ibase=0; ibase<stats->max_len; ibase++)
- {
- printf("MPC\t%d",ibase+1);
- for (iqual=0; iqual<=stats->max_qual; iqual++)
- {
- printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
- }
- printf("\n");
- }
- }
- printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
- int ibase_prev = 0;
- for (ibase=0; ibase<stats->ngc; ibase++)
- {
- if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
- printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
- ibase_prev = ibase;
- }
- printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
- ibase_prev = 0;
- for (ibase=0; ibase<stats->ngc; ibase++)
- {
- if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
- printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
- ibase_prev = ibase;
- }
- printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
- for (ibase=0; ibase<stats->max_len; ibase++)
- {
- uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
- uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
- if ( ! sum ) continue;
- printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
- }
- printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
- for (isize=0; isize<ibulk; isize++)
- printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
- (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
-
- printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
- int ilen;
- for (ilen=0; ilen<stats->max_len; ilen++)
- {
- if ( stats->read_lengths[ilen]>0 )
- printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
- }
-
- printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
- for (ilen=0; ilen<stats->nindels; ilen++)
- {
- if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
- printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
- }
-
- printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
- for (ilen=0; ilen<=stats->nbases; ilen++)
- {
- // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions
- // the index of the cycle of the first inserted base (also 1-based)
- if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
- printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
- }
-
- printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
- if ( stats->cov[0] )
- printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
- int icov;
- for (icov=1; icov<stats->ncov-1; icov++)
- if ( stats->cov[icov] )
- printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
- if ( stats->cov[stats->ncov-1] )
- printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
-
- // Calculate average GC content, then sort by GC and depth
- printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
- uint32_t igcd;
- for (igcd=0; igcd<stats->igcd; igcd++)
- {
- if ( stats->fai )
- stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc);
- else
- if ( stats->gcd[igcd].depth )
- stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
- }
- qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
- igcd = 0;
- while ( igcd < stats->igcd )
- {
- // Calculate percentiles (10,25,50,75,90th) for the current GC content and print
- uint32_t nbins=0, itmp=igcd;
- float gc = stats->gcd[igcd].gc;
- while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 )
- {
- nbins++;
- itmp++;
- }
- printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
- gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
- gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size,
- gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size,
- gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size,
- gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size
- );
- igcd += nbins;
- }
-}
-
-size_t mygetline(char **line, size_t *n, FILE *fp)
-{
- if (line == NULL || n == NULL || fp == NULL)
- {
- errno = EINVAL;
- return -1;
- }
- if (*n==0 || !*line)
- {
- *line = NULL;
- *n = 0;
- }
-
- size_t nread=0;
- int c;
- while ((c=getc(fp))!= EOF && c!='\n')
- {
- if ( ++nread>=*n )
- {
- *n += 255;
- *line = realloc(*line, sizeof(char)*(*n));
- }
- (*line)[nread-1] = c;
- }
- if ( nread>=*n )
- {
- *n += 255;
- *line = realloc(*line, sizeof(char)*(*n));
- }
- (*line)[nread] = 0;
- return nread>0 ? nread : -1;
-
-}
-
-void init_regions(stats_t *stats, char *file)
-{
- khiter_t iter;
- khash_t(kh_bam_tid) *header_hash;
-
- bam_init_header_hash(stats->sam->header);
- header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash;
-
- FILE *fp = fopen(file,"r");
- if ( !fp ) error("%s: %s\n",file,strerror(errno));
-
- char *line = NULL;
- size_t len = 0;
- ssize_t nread;
- int warned = 0;
- int prev_tid=-1, prev_pos=-1;
- while ((nread = mygetline(&line, &len, fp)) != -1)
- {
- if ( line[0] == '#' ) continue;
-
- int i = 0;
- while ( i<nread && !isspace(line[i]) ) i++;
- if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
- line[i] = 0;
-
- iter = kh_get(kh_bam_tid, header_hash, line);
- int tid = kh_val(header_hash, iter);
- if ( iter == kh_end(header_hash) )
- {
- if ( !warned )
- fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line);
- warned = 1;
- continue;
- }
-
- if ( tid >= stats->nregions )
- {
- stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100));
- int j;
- for (j=stats->nregions; j<stats->nregions+100; j++)
- {
- stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0;
- stats->regions[j].pos = NULL;
- }
- stats->nregions += 100;
- }
- int npos = stats->regions[tid].npos;
- if ( npos >= stats->regions[tid].mpos )
- {
- stats->regions[tid].mpos += 1000;
- stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
- }
-
- if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n");
- if ( prev_tid==-1 || prev_tid!=tid )
- {
- prev_tid = tid;
- prev_pos = stats->regions[tid].pos[npos].from;
- }
- if ( prev_pos>stats->regions[tid].pos[npos].from )
- error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
- stats->regions[tid].npos++;
- }
- if (line) free(line);
- if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n");
- fclose(fp);
-}
-
-void destroy_regions(stats_t *stats)
-{
- int i;
- for (i=0; i<stats->nregions; i++)
- {
- if ( !stats->regions[i].mpos ) continue;
- free(stats->regions[i].pos);
- }
- if ( stats->regions ) free(stats->regions);
-}
-
-static int fetch_read(const bam1_t *bam_line, void *data)
-{
- collect_stats((bam1_t*)bam_line,(stats_t*)data);
- return 1;
-}
-
-void reset_regions(stats_t *stats)
-{
- int i;
- for (i=0; i<stats->nregions; i++)
- stats->regions[i].cpos = 0;
-}
-
-int is_in_regions(bam1_t *bam_line, stats_t *stats)
-{
- if ( !stats->regions ) return 1;
-
- if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0;
- if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
-
- regions_t *reg = &stats->regions[bam_line->core.tid];
- if ( reg->cpos==reg->npos ) return 0; // done for this chr
-
- // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered,
- // even small overlap is enough to include the read in the stats.
- int i = reg->cpos;
- while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
- if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; }
- if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0;
- reg->cpos = i;
- stats->reg_from = reg->pos[i].from;
- stats->reg_to = reg->pos[i].to;
-
- return 1;
-}
-
-void init_group_id(stats_t *stats, char *id)
-{
- if ( !stats->sam->header->dict )
- stats->sam->header->dict = sam_header_parse2(stats->sam->header->text);
- void *iter = stats->sam->header->dict;
- const char *key, *val;
- int n = 0;
- stats->rg_hash = kh_init(kh_rg);
- while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
- {
- if ( !strcmp(id,key) || (val && !strcmp(id,val)) )
- {
- khiter_t k = kh_get(kh_rg, stats->rg_hash, key);
- if ( k != kh_end(stats->rg_hash) )
- fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
- int ret;
- k = kh_put(kh_rg, stats->rg_hash, key, &ret);
- kh_value(stats->rg_hash, k) = val;
- n++;
- }
- }
- if ( !n )
- error("The sample or read group \"%s\" not present.\n", id);
-}
-
-
-void error(const char *format, ...)
-{
- if ( !format )
- {
- printf("Version: %s\n", BAMCHECK_VERSION);
- printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n");
- printf("Usage: bamcheck [OPTIONS] file.bam\n");
- printf(" bamcheck [OPTIONS] file.bam chr:from-to\n");
- printf("Options:\n");
- printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n");
- printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n");
- printf(" -f, --required-flag <int> Required flag, 0 for unset [0]\n");
- printf(" -F, --filtering-flag <int> Filtering flag, 0 for unset [0]\n");
- printf(" --GC-depth <float,float> Bin size for GC-depth graph and the maximum reference length [2e4,4.2e9]\n");
- printf(" -h, --help This help message\n");
- printf(" -i, --insert-size <int> Maximum insert size [8000]\n");
- printf(" -I, --id <string> Include only listed read group or sample name\n");
- printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
- printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
- printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
- printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth calculation).\n");
- printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
- printf(" -s, --sam Input is SAM\n");
- printf("\n");
- }
- else
- {
- va_list ap;
- va_start(ap, format);
- vfprintf(stderr, format, ap);
- va_end(ap);
- }
- exit(-1);
-}
-
-int main(int argc, char *argv[])
-{
- char *targets = NULL;
- char *bam_fname = NULL;
- char *group_id = NULL;
- samfile_t *sam = NULL;
- char in_mode[5];
-
- stats_t *stats = calloc(1,sizeof(stats_t));
- stats->ngc = 200;
- stats->nquals = 256;
- stats->nbases = 300;
- stats->nisize = 8000;
- stats->max_len = 30;
- stats->max_qual = 40;
- stats->isize_main_bulk = 0.99; // There are always outliers at the far end
- stats->gcd_bin_size = 20e3;
- stats->gcd_ref_size = 4.2e9;
- stats->rseq_pos = -1;
- stats->tid = stats->gcd_pos = -1;
- stats->igcd = 0;
- stats->is_sorted = 1;
- stats->cov_min = 1;
- stats->cov_max = 1000;
- stats->cov_step = 1;
- stats->argc = argc;
- stats->argv = argv;
- stats->filter_readlen = -1;
- stats->nindels = stats->nbases;
-
- strcpy(in_mode, "rb");
-
- static struct option loptions[] =
- {
- {"help",0,0,'h'},
- {"remove-dups",0,0,'d'},
- {"sam",0,0,'s'},
- {"ref-seq",1,0,'r'},
- {"coverage",1,0,'c'},
- {"read-length",1,0,'l'},
- {"insert-size",1,0,'i'},
- {"most-inserts",1,0,'m'},
- {"trim-quality",1,0,'q'},
- {"target-regions",0,0,'t'},
- {"required-flag",1,0,'f'},
- {"filtering-flag",0,0,'F'},
- {"id",1,0,'I'},
- {"GC-depth",1,0,1},
- {0,0,0,0}
- };
- int opt;
- while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 )
- {
- switch (opt)
- {
- case 'f': stats->flag_require=strtol(optarg,0,0); break;
- case 'F': stats->flag_filter=strtol(optarg,0,0); break;
- case 'd': stats->flag_filter|=BAM_FDUP; break;
- case 's': strcpy(in_mode, "r"); break;
- case 'r': stats->fai = fai_load(optarg);
- if (stats->fai==0)
- error("Could not load faidx: %s\n", optarg);
- break;
- case 1 : {
- float flen,fbin;
- if ( sscanf(optarg,"%f,%f",&fbin,&flen)!= 2 )
- error("Unable to parse --GC-depth %s\n", optarg);
- stats->gcd_bin_size = fbin;
- stats->gcd_ref_size = flen;
- }
- break;
- case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 )
- error("Unable to parse -c %s\n", optarg);
- break;
- case 'l': stats->filter_readlen = atoi(optarg); break;
- case 'i': stats->nisize = atoi(optarg); break;
- case 'm': stats->isize_main_bulk = atof(optarg); break;
- case 'q': stats->trim_qual = atoi(optarg); break;
- case 't': targets = optarg; break;
- case 'I': group_id = optarg; break;
- case '?':
- case 'h': error(NULL);
- default: error("Unknown argument: %s\n", optarg);
- }
- }
- if ( optind<argc )
- bam_fname = argv[optind++];
-
- if ( !bam_fname )
- {
- if ( isatty(fileno((FILE *)stdin)) )
- error(NULL);
- bam_fname = "-";
- }
-
- // Init structures
- // .. coverage bins and round buffer
- if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 )
- {
- stats->cov_step = stats->cov_max - stats->cov_min;
- if ( stats->cov_step <= 0 )
- stats->cov_step = 1;
- }
- stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step;
- stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1;
- stats->cov = calloc(sizeof(uint64_t),stats->ncov);
- stats->cov_rbuf.size = stats->nbases*5;
- stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
- // .. bam
- if ((sam = samopen(bam_fname, in_mode, NULL)) == 0)
- error("Failed to open: %s\n", bam_fname);
- stats->sam = sam;
- if ( group_id ) init_group_id(stats, group_id);
- bam1_t *bam_line = bam_init1();
- // .. arrays
- stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
- stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
- stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
- stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
- stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
- stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
- stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
- stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
- stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
- stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
- stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
- stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
- stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
- stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
- stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
- stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
- stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
- realloc_rseq_buffer(stats);
- if ( targets )
- init_regions(stats, targets);
-
- // Collect statistics
- if ( optind<argc )
- {
- // Collect stats in selected regions only
- bam_index_t *bam_idx = bam_index_load(bam_fname);
- if (bam_idx == 0)
- error("Random alignment retrieval only works for indexed BAM files.\n");
-
- int i;
- for (i=optind; i<argc; i++)
- {
- int tid, beg, end;
- bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end);
- if ( tid < 0 ) continue;
- reset_regions(stats);
- bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read);
- }
- bam_index_destroy(bam_idx);
- }
- else
- {
- // Stream through the entire BAM ignoring off-target regions if -t is given
- while (samread(sam,bam_line) >= 0)
- collect_stats(bam_line,stats);
- }
- round_buffer_flush(stats,-1);
-
- output_stats(stats);
-
- bam_destroy1(bam_line);
- samclose(stats->sam);
- if (stats->fai) fai_destroy(stats->fai);
- free(stats->cov_rbuf.buffer); free(stats->cov);
- free(stats->quals_1st); free(stats->quals_2nd);
- free(stats->gc_1st); free(stats->gc_2nd);
- free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other);
- free(stats->gcd);
- free(stats->rseq_buf);
- free(stats->mpc_buf);
- free(stats->acgt_cycles);
- free(stats->read_lengths);
- free(stats->insertions);
- free(stats->deletions);
- free(stats->ins_cycles_1st);
- free(stats->ins_cycles_2nd);
- free(stats->del_cycles_1st);
- free(stats->del_cycles_2nd);
- destroy_regions(stats);
- free(stats);
- if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash);
-
- return 0;
-}
-
-
-
diff --git a/sam/misc/blast2sam.pl b/sam/misc/blast2sam.pl
deleted file mode 100755
index 084f018..0000000
--- a/sam/misc/blast2sam.pl
+++ /dev/null
@@ -1,92 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use warnings;
-use Getopt::Std;
-
-&blast2sam;
-
-sub blast2sam {
- my %opts = ();
- getopts('s', \%opts);
- die("Usage: blast2sam.pl <in.blastn>\n") if (-t STDIN && @ARGV == 0);
- my ($qlen, $slen, $q, $s, $qbeg, $qend, @sam, @cigar, @cmaux, $show_seq);
- $show_seq = defined($opts{s});
- @sam = (); @sam[0,4,6..8,10] = ('', 255, '*', 0, 0, '*');
- while (<>) {
- if (@cigar && (/^Query=/ || /Score =.*bits.*Expect/)) { # print
- &blast_print_sam(\@sam, \@cigar, \@cmaux, $qlen - $qend);
- @cigar = ();
- }
- if (/^Query= (\S+)/) {
- $sam[0] = $1;
- } elsif (/$(\S+)\s+letters$/) {
- $qlen = $1; $qlen =~ s/,//g;
- } elsif (/^>(\S+)/) {
- $sam[2] = $1;
- } elsif (/Length = (\d+)/) {
- $slen = $1;
- } elsif (/Score =\s+(\S+) bits.+Expect($\d+$)? = (\S+)/) { # the start of an alignment block
- my ($as, $ev) = (int($1 + .499), $3);
- $ev = "1$ev" if ($ev =~ /^e/);
- @sam[1,3,9,11,12] = (0, 0, '', "AS:i:$as", "EV:Z:$ev");
- @cigar = (); $qbeg = 0;
- @cmaux = (0, 0, 0, '');
- } elsif (/Strand = (\S+) \/ (\S+)/) {
- $sam[1] |= 0x10 if ($2 eq 'Minus');
- } elsif (/Query\:\s(\d+)\s*(\S+)\s(\d+)/) {
- $q = $2;
- unless ($qbeg) {
- $qbeg = $1;
- push(@cigar, ($1-1) . "H") if ($1 > 1);
- }
- $qend = $3;
- if ($show_seq) {
- my $x = $q;
- $x =~ s/-//g; $sam[9] .= $x;
- }
- } elsif (/Sbjct\:\s(\d+)\s*(\S+)\s(\d+)/) {
- $s = $2;
- if ($sam[1] & 0x10) {
- $sam[3] = $3;
- } else {
- $sam[3] = $1 unless ($sam[3]);
- }
- &aln2cm(\@cigar, \$q, \$s, \@cmaux);
- }
- }
- &blast_print_sam(\@sam, \@cigar, \@cmaux, $qlen - $qend);
-}
-
-sub blast_print_sam {
- my ($sam, $cigar, $cmaux, $qrest) = @_;
- push(@$cigar, $cmaux->[1] . substr("MDI", $cmaux->[0], 1));
- push(@$cigar, $qrest . 'H') if ($qrest);
- if ($sam->[1] & 0x10) {
- @$cigar = reverse(@$cigar);
- $sam->[9] = reverse($sam->[9]);
- $sam->[9] =~ tr/atgcrymkswATGCRYMKSW/tacgyrkmswTACGYRKMSW/;
- }
- $sam->[9] = '*' if (!$sam->[9]);
- $sam->[5] = join('', @$cigar);
- print join("\t", @$sam), "\n";
-}
-
-sub aln2cm {
- my ($cigar, $q, $s, $cmaux) = @_;
- my $l = length($$q);
- for (my $i = 0; $i < $l; ++$i) {
- my $op;
- # set $op
- if (substr($$q, $i, 1) eq '-') { $op = 2; }
- elsif (substr($$s, $i, 1) eq '-') { $op = 1; }
- else { $op = 0; }
- # for CIGAR
- if ($cmaux->[0] == $op) {
- ++$cmaux->[1];
- } else {
- push(@$cigar, $cmaux->[1] . substr("MDI", $cmaux->[0], 1));
- $cmaux->[0] = $op; $cmaux->[1] = 1;
- }
- }
-}
diff --git a/sam/misc/bowtie2sam.pl b/sam/misc/bowtie2sam.pl
deleted file mode 100755
index 5dff88d..0000000
--- a/sam/misc/bowtie2sam.pl
+++ /dev/null
@@ -1,92 +0,0 @@
-#!/usr/bin/perl -w
-
-# Contact: lh3
-# Version: 0.1.1
-
-use strict;
-use warnings;
-use Getopt::Std;
-
-&bowtie2sam;
-exit;
-
-sub bowtie2sam {
- my %opts = ();
- die("Usage: bowtie2sam.pl <aln.bowtie>\n") if (@ARGV == 0 && -t STDIN);
- # core loop
- my (@s, $last, @staging, $k, $best_s, $subbest_s, $best_k);
- $last = '';
- while (<>) {
- my ($name, $nm) = &bowtie2sam_aux($_, \@s); # read_name, number of mismatches
- if ($name eq $last) {
- # I do not know whether the multiple hits are ordered on the
- # number of mismatches. I assume they are not and so I have to
- # keep all these multiple hits in memory.
- @{$staging[$k]} = @s;
- if ($best_s > $nm) {
- $subbest_s = $best_s;
- $best_s = $nm;
- $best_k = $k;
- } elsif ($subbest_s > $nm) {
- $subbest_s = $nm;
- }
- ++$k;
- } else {
- if ($last) {
- if ($best_s == $subbest_s) {
- $staging[$best_k][4] = 0;
- } elsif ($subbest_s - $best_s == 1) {
- $staging[$best_k][4] = 15 if ($staging[$best_k][4] > 15);
- }
- print join("\t", @{$staging[$best_k]}), "\n";
- }
- $k = 1; $best_s = $nm; $subbest_s = 1000; $best_k = 0;
- @{$staging[0]} = @s;
- $last = $name;
- }
- }
- print join("\t", @{$staging[$best_k]}), "\n" if ($best_k >= 0);
-}
-
-sub bowtie2sam_aux {
- my ($line, $s) = @_;
- chomp($line);
- my @t = split("\t", $line);
- my $ret;
- @$s = ();
- # read name
- $s->[0] = $ret = $t[0];
- $s->[0] =~ s/\/[12]$//g;
- # initial flag (will be updated later)
- $s->[1] = 0;
- # read & quality
- $s->[9] = $t[4]; $s->[10] = $t[5];
- # cigar
- $s->[5] = length($s->[9]) . "M";
- # coor
- $s->[2] = $t[2]; $s->[3] = $t[3] + 1;
- $s->[1] |= 0x10 if ($t[1] eq '-');
- # mapQ
- $s->[4] = $t[6] == 0? 25 : 0;
- # mate coordinate
- $s->[6] = '*'; $s->[7] = $s->[8] = 0;
- # aux
- my $nm = @t - 7;
- push(@$s, "NM:i:" . (@t-7));
- push(@$s, "X$nm:i:" . ($t[6]+1));
- my $md = '';
- if ($t[7]) {
- $_ = $t[7];
- my $a = 0;
- while (/(\d+):[ACGTN]>([ACGTN])/gi) {
- my ($y, $z) = ($1, $2);
- $md .= (int($y)-$a) . $z;
- $a += $y - $a + 1;
- }
- $md .= length($s->[9]) - $a;
- } else {
- $md = length($s->[9]);
- }
- push(@$s, "MD:Z:$md");
- return ($ret, $nm);
-}
diff --git a/sam/misc/export2sam.pl b/sam/misc/export2sam.pl
deleted file mode 100755
index ec6dacf..0000000
--- a/sam/misc/export2sam.pl
+++ /dev/null
@@ -1,545 +0,0 @@
-#!/usr/bin/env perl
-#
-#
-# export2sam.pl converts GERALD export files to SAM format.
-#
-#
-#
-########## License:
-#
-# The MIT License
-#
-# Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd.
-# Modified SAMtools work copyright (c) 2010 Illumina, Inc.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-#
-#
-#
-#
-########## ChangeLog:
-#
-# Version: 2.3.1 (18MAR2011)
-#
-# - Restore file '-' as stdin input.
-#
-# Version: 2.3.0 (24JAN2011)
-#
-# - Add support for export reserved chromosome name "CONTROL",
-# which is translated to optional field "XC:Z:CONTROL".
-# - Check for ".gz" file extension on export files and open
-# these as gzip pipes when the extension is found.
-#
-# Version: 2.2.0 (16NOV2010)
-#
-# - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y
-# - For export records with reserved chromosome name identifiers
-# "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM"
-# to the SAM record, so that these cases can be distinguished
-# from other unmatched reads.
-#
-# Version: 2.1.0 (21SEP2010)
-#
-# - Additional export record error checking.
-# - Convert export records with chromomsome value of "RM" to unmapped
-# SAM records.
-#
-# Version: 2.0.0 (15FEB2010)
-#
-# Script updated by Illumina in conjunction with CASAVA 1.7.0
-# release.
-#
-# Major changes are as follows:
-# - The CIGAR string has been updated to include all gaps from
-# ELANDv2 alignments.
-# - The ELAND single read alignment score is always stored in the
-# optional "SM" field and the ELAND paired read alignment score
-# is stored in the optional "AS" field when it exists.
-# - The MAPQ value is set to the higher of the two alignment scores,
-# but no greater than 254, i.e. min(254,max(SM,AS))
-# - The SAM "proper pair" bit (0x0002) is now set for read pairs
-# meeting ELAND's expected orientation and insert size criteria.
-# - The default quality score translation is set for export files
-# which contain Phread+64 quality values. An option,
-# "--qlogodds", has been added to translate quality values from
-# the Solexa+64 format used in export files prior to Pipeline
-# 1.3
-# - The export match descriptor is now reverse-complemented when
-# necessary such that it always corresponds to the forward
-# strand of the reference, to be consistent with other
-# information in the SAM record. It is now written to the
-# optional 'XD' field (rather than 'MD') to acknowledge its
-# minor differences from the samtools match descriptor (see
-# additional detail below).
-# - An option, "--nofilter", has been added to include reads which
-# have failed primary analysis quality filtration. Such reads
-# will have the corresponding SAM flag bit (0x0200) set.
-# - Labels in the export 'contig' field are preserved by setting
-# RNAME to "$export_chromosome/$export_contig" when the contig
-# label exists.
-#
-#
-# Contact: lh3
-# Version: 0.1.2 (03JAN2009)
-#
-#
-#
-########## Known Conversion Limitations:
-#
-# - Export records for reads that map to a position < 1 (allowed
-# in export format), are converted to unmapped reads in the SAM
-# record.
-# - Export records contain the reserved chromosome names: "NM",
-# "QC","RM" and "CONTROL". "NM" indicates that the aligner could
-# not map the read to the reference sequence set. "QC" means that
-# the aligner did not attempt to map the read due to some
-# technical limitation. "RM" means that the read mapped to a set
-# of 'contaminant' sequences specified in GERALD's RNA-seq
-# workflow. "CONTROL" means that the read is a control. All of
-# these alignment types are collapsed to the single unmapped
-# alignment state in the SAM record, but the optional SAM "XC"
-# field is used to record the original reserved chromosome name of
-# the read for all but the "NM" case.
-# - The export match descriptor is slightly different than the
-# samtools match descriptor. For this reason it is stored in the
-# optional SAM field 'XD' (and not 'MD'). Note that the export
-# match descriptor differs from the samtools version in two
-# respects: (1) indels are explicitly closed with the '$'
-# character and (2) insertions must be enumerated in the match
-# descriptor. For example a 35-base read with a two-base insertion
-# is described as: 20^2$14
-#
-#
-#
-
-my $version = "2.3.1";
-
-use strict;
-use warnings;
-
-use Getopt::Long;
-use File::Spec;
-use List::Util qw(min max);
-
-
-use constant {
- EXPORT_MACHINE => 0,
- EXPORT_RUNNO => 1,
- EXPORT_LANE => 2,
- EXPORT_TILE => 3,
- EXPORT_X => 4,
- EXPORT_Y => 5,
- EXPORT_INDEX => 6,
- EXPORT_READNO => 7,
- EXPORT_READ => 8,
- EXPORT_QUAL => 9,
- EXPORT_CHROM => 10,
- EXPORT_CONTIG => 11,
- EXPORT_POS => 12,
- EXPORT_STRAND => 13,
- EXPORT_MD => 14,
- EXPORT_SEMAP => 15,
- EXPORT_PEMAP => 16,
- EXPORT_PASSFILT => 21,
- EXPORT_SIZE => 22,
-};
-
-
-use constant {
- SAM_QNAME => 0,
- SAM_FLAG => 1,
- SAM_RNAME => 2,
- SAM_POS => 3,
- SAM_MAPQ => 4,
- SAM_CIGAR => 5,
- SAM_MRNM => 6,
- SAM_MPOS => 7,
- SAM_ISIZE => 8,
- SAM_SEQ => 9,
- SAM_QUAL => 10,
-};
-
-
-# function prototypes for Richard's code
-sub match_desc_to_cigar($);
-sub match_desc_frag_length($);
-sub reverse_compl_match_descriptor($);
-sub write_header($;$;$);
-
-
-&export2sam;
-exit;
-
-
-
-
-sub export2sam {
-
- my $cmdline = $0 . " " . join(" ",@ARGV);
- my $arg_count = scalar @ARGV;
- my $progname = (File::Spec->splitpath($0))[2];
-
- my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values
- my $is_nofilter = 0;
- my $read1file;
- my $read2file;
- my $print_version = 0;
- my $help = 0;
-
- my $result = GetOptions( "qlogodds" => \$is_logodds_qvals,
- "nofilter" => \$is_nofilter,
- "read1=s" => \$read1file,
- "read2=s" => \$read2file,
- "version" => \$print_version,
- "help" => \$help );
-
- my $usage = <<END;
-
-$progname converts GERALD export files to SAM format.
-
-Usage: $progname --read1=FILENAME [ options ] | --version | --help
-
- --read1=FILENAME read1 export file or '-' for stdin (mandatory)
- (file may be gzipped with ".gz" extension)
- --read2=FILENAME read2 export file or '-' for stdin
- (file may be gzipped with ".gz" extension)
- --nofilter include reads that failed the basecaller
- purity filter
- --qlogodds assume export file(s) use logodds quality values
- as reported by OLB (Pipeline) prior to v1.3
- (default: phred quality values)
-
-END
-
- my $version_msg = <<END;
-
-$progname version: $version
-
-END
-
- if((not $result) or $help or ($arg_count==0)) {
- die($usage);
- }
-
- if(@ARGV) {
- print STDERR "\nERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n\n";
- die($usage);
- }
-
- if($print_version) {
- die($version_msg);
- }
-
- if(not defined($read1file)) {
- print STDERR "\nERROR: read1 export file must be specified\n\n";
- die($usage);
- }
-
- unless((-f $read1file) or ($read1file eq '-')) {
- die("\nERROR: Can't find read1 export file: '$read1file'\n\n");
- }
-
- if (defined $read2file) {
- unless((-f $read2file) or ($read2file eq '-')) {
- die("\nERROR: Can't find read2 export file: '$read2file'\n\n");
- }
- if($read1file eq $read2file) {
- die("\nERROR: read1 and read2 export filenames are the same: '$read1file'\n\n");
- }
- }
-
- my ($fh1, $fh2, $is_paired);
-
- my $read1cmd="$read1file";
- $read1cmd = "gzip -dc $read1file |" if($read1file =~ /\.gz$/);
- open($fh1, $read1cmd)
- or die("\nERROR: Can't open read1 process: '$read1cmd'\n\n");
- $is_paired = defined $read2file;
- if ($is_paired) {
- my $read2cmd="$read2file";
- $read2cmd = "gzip -dc $read2file |" if($read2file =~ /\.gz$/);
- open($fh2, $read2cmd)
- or die("\nERROR: Can't open read2 process: '$read2cmd'\n\n");
- }
- # quality value conversion table
- my @conv_table;
- if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3):
- for (-64..64) {
- $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499);
- }
- } else { # convert from phred+64 quality values (pipeline v1.3+):
- for (-64..-1) {
- $conv_table[$_+64] = undef;
- }
- for (0..64) {
- $conv_table[$_+64] = int(33 + $_);
- }
- }
- # write the header
- print write_header( $progname, $version, $cmdline );
- # core loop
- my $export_line_count = 0;
- while (<$fh1>) {
- $export_line_count++;
- my (@s1, @s2);
- &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter);
- if ($is_paired) {
- my $read2line = <$fh2>;
- if(not $read2line){
- die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n");
- }
- &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter);
-
- if (@s1 && @s2) { # then set mate coordinate
- if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){
- die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n");
- }
-
- my $isize = 0;
- if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize
- my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS];
- my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS];
- $isize = $x2 - $x1;
- }
-
- foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){
- my ($sa,$sb,$is) = @{$_};
- if ($sb->[SAM_RNAME] ne '*') {
- $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME];
- $sa->[SAM_MPOS] = $sb->[SAM_POS];
- $sa->[SAM_ISIZE] = $is;
- $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10);
- } else {
- $sa->[SAM_FLAG] |= 0x8;
- }
- }
- }
- }
- print join("\t", @s1), "\n" if (@s1);
- print join("\t", @s2), "\n" if (@s2 && $is_paired);
- }
- close($fh1);
- if($is_paired) {
- while(my $read2line = <$fh2>){
- $export_line_count++;
- die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n");
- }
- close($fh2);
- }
-}
-
-sub export2sam_aux {
- my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_;
- chomp($line);
- my @t = split("\t", $line);
- if(scalar(@t) < EXPORT_SIZE) {
- my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . EXPORT_SIZE . ".\n";
- $msg.="\t...erroneous export record:\n" . $line . "\n\n";
- die($msg);
- }
- @$s = ();
- my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y');
- return if(not ($isPassFilt or $is_nofilter));
- # read name
- my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : "");
- $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]),
- int($t[EXPORT_X]), int($t[EXPORT_Y]));
- # initial flag (will be updated later)
- $s->[SAM_FLAG] = 0;
- if($is_paired) {
- if($t[EXPORT_READNO] != $read_no){
- die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n");
- }
- $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no);
- }
- $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt);
-
- # read & quality
- my $is_export_rev = ($t[EXPORT_STRAND] eq 'R');
- if ($is_export_rev) { # then reverse the sequence and quality
- $s->[SAM_SEQ] = reverse($t[EXPORT_READ]);
- $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/;
- $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]);
- } else {
- $s->[SAM_SEQ] = $t[EXPORT_READ];
- $s->[SAM_QUAL] = $t[EXPORT_QUAL];
- }
- my @convqual = ();
- foreach (unpack('C*', $s->[SAM_QUAL])){
- my $val=$ct->[$_];
- if(not defined $val){
- my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n";
- if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; }
- die($msg . "\n");
- }
- push @convqual,$val;
- }
-
- $s->[SAM_QUAL] = pack('C*',@convqual); # change coding
-
-
- # coor
- my $has_coor = 0;
- $s->[SAM_RNAME] = "*";
- if (($t[EXPORT_CHROM] eq 'NM') or
- ($t[EXPORT_CHROM] eq 'QC') or
- ($t[EXPORT_CHROM] eq 'RM') or
- ($t[EXPORT_CHROM] eq 'CONTROL')) {
- $s->[SAM_FLAG] |= 0x4; # unmapped
- push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM');
- } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) {
- $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case?
- push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3")
- } elsif ($t[EXPORT_POS] < 1) {
- $s->[SAM_FLAG] |= 0x4; # unmapped
- } else {
- $s->[SAM_RNAME] = $t[EXPORT_CHROM];
- $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne '');
- $has_coor = 1;
- }
- $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0;
-
-# print STDERR "t[14] = " . $t[14] . "\n";
- my $matchDesc = '';
- $s->[SAM_CIGAR] = "*";
- if($has_coor){
- $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD];
-
- if($matchDesc =~ /\^/){
- # construct CIGAR string using Richard's function
- $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing
- } else {
- $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M";
- }
- }
-
-# print STDERR "cigar_string = $cigar_string\n";
-
- $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev);
- if($has_coor){
- my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0;
- my $pemap = 0;
- if($is_paired) {
- $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0;
-
- # set `proper pair' bit if non-blank, non-zero PE alignment score:
- $s->[SAM_FLAG] |= 0x02 if ($pemap > 0);
- }
- $s->[SAM_MAPQ] = min(254,max($semap,$pemap));
- } else {
- $s->[SAM_MAPQ] = 0;
- }
- # mate coordinate
- $s->[SAM_MRNM] = '*';
- $s->[SAM_MPOS] = 0;
- $s->[SAM_ISIZE] = 0;
- # aux
- push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]);
- if($has_coor){
- # The export match descriptor differs slightly from the samtools match descriptor.
- # In order for the converted SAM files to be as compliant as possible,
- # we put the export match descriptor in optional field 'XD' rather than 'MD':
- push(@$s, "XD:Z:$matchDesc");
- push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne '');
- push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne ''));
- }
-}
-
-
-
-#
-# the following code is taken from Richard Shaw's sorted2sam.pl file
-#
-sub reverse_compl_match_descriptor($)
-{
-# print "\nREVERSING THE MATCH DESCRIPTOR!\n";
- my ($match_desc) = @_;
- my $rev_compl_match_desc = reverse($match_desc);
- $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/;
-
- # Unreverse the digits of numbers.
- $rev_compl_match_desc = join('',
- map {($_ =~ /\d+/)
- ? join('', reverse(split('', $_)))
- : $_} split(/(\d+)/,
- $rev_compl_match_desc));
-
- return $rev_compl_match_desc;
-}
-
-
-
-sub match_desc_to_cigar($)
-{
- my ($match_desc) = @_;
-
- my @match_desc_parts = split(/(\^.*?\$)/, $match_desc);
- my $cigar_str = '';
- my $cigar_del_ch = 'D';
- my $cigar_ins_ch = 'I';
- my $cigar_match_ch = 'M';
-
- foreach my $match_desc_part (@match_desc_parts) {
- next if (!$match_desc_part);
-
- if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) {
- # Deletion
- $cigar_str .= (length($1) . $cigar_del_ch);
- } elsif ($match_desc_part =~ /^\^(\d+)\$$/) {
- # Insertion
- $cigar_str .= ($1 . $cigar_ins_ch);
- } else {
- $cigar_str .= (match_desc_frag_length($match_desc_part)
- . $cigar_match_ch);
- }
- }
-
- return $cigar_str;
-}
-
-
-#------------------------------------------------------------------------------
-
-sub match_desc_frag_length($)
- {
- my ($match_desc_str) = @_;
- my $len = 0;
-
- my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str);
-
- foreach my $match_desc_field (@match_desc_fields) {
- next if ($match_desc_field eq '');
-
- $len += (($match_desc_field =~ /(\d+)/)
- ? $1 : length($match_desc_field));
- }
-
- return $len;
-}
-
-
-# argument holds the command line
-sub write_header($;$;$)
-{
- my ($progname,$version,$cl) = @_;
- my $complete_header = "";
- $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n";
-
- return $complete_header;
-}
diff --git a/sam/misc/interpolate_sam.pl b/sam/misc/interpolate_sam.pl
deleted file mode 100755
index 6cd6831..0000000
--- a/sam/misc/interpolate_sam.pl
+++ /dev/null
@@ -1,125 +0,0 @@
-#!/usr/bin/perl
-use strict;
-
-###Builds interpolated pileup from SAM file
-##@description counts bases between paired ends and piles up single end reads.
-##@output, uses a #header for the RNAME and then the number of reads per base
-##@author ***@sanger.ac.uk, Stephen B. Montgomery
-
-##@caveats
-##Requires RNAME to have format as per example
-## chromosome:NCBI36:18:1:76117153:1
-## supercontig::NT_113883:1:137703:1
-## clone::AC138827.3:1:149397:1
-##Expects simple CIGAR characters, M, I and D
-##Expects SAM file to be sorted.
-##Expects 0x0010 to mark second read in PE file (as has been the observed case from MAQ output) (important for line 77)
-
-##Verify and read in SAM file
-my $sam_file = $ARGV[0];
-if(!defined($sam_file)) { die("No sam file defined on arg 1"); }
-unless(-f $sam_file) { die("Sam file does not exist: $sam_file"); }
-open(SAM, $sam_file) || die("Cannot open sam file");
-
-##Globals
-my $current_location = ""; ##Current RNAME being processed
-my $current_size = 0; ##Size of sequence region being processed
-my $current_position = 1; ##Current base being processed
-my $open = 0; ##Number of open reads (PE reads that have not been closed)
-my %close = (); ##Hash of closing positions, when the current_position gets to this position it subtracts the
- ##contained value from those open and deletes the indexed position from the hash
-
-while (my $line = <SAM>) {
- my @tokens = split /\t/, $line;
-
- if ($current_location ne $tokens[2]) { ##Start a new sequence region
- for (my $i = $current_position; $i <= $current_size; $i++) { ##Close the previous sequence region
- if (defined($close{$i})) {
- $open = $open - $close{$i};
- delete $close{$i};
- }
- print $open . "\n";
- }
- if ($current_location ne "") {
- print "\n";
- }
-
- ##Initiate a new sequence region
- my @location_tokens = split /:/, $tokens[2];
- $current_position = 1;
- $current_location = $tokens[2];
- $current_size = $location_tokens[4];
- $open = 0;
- %close = ();
- print "#" . $tokens[2] . "\n";
-
- ##Print pileup to just before the first read (will be 0)
- for (my $current_position = 1; $current_position < $tokens[3]; $current_position++) {
- print $open . "\n";
- }
- $current_position = $tokens[3];
-
- } else { ##Sequence region already open
- if ($tokens[3] > $current_position) { ##If the new read's position is greater than the current position
- ##cycle through to catch up to the current position
- for (my $i = $current_position; $i < $tokens[3]; $i++) {
- if (defined($close{$i})) {
- $open = $open - $close{$i};
- delete $close{$i};
- }
- print $open . "\n";
- }
- $current_position = $tokens[3];
- }
- }
- $open++; ##Increment the number of open reads
-
- if (($tokens[1] & 0x0080 || $tokens[1] & 0x0040) && $tokens[1] & 0x0010 && $tokens[1] & 0x0002) { ##if second read of mate pair, add close condition
- $open--;
- my $parsed_cig = &parseCigar($tokens[5]);
- my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1;
- if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; }
- $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1;
- } elsif (!($tokens[1] & 0x0001) || !($tokens[1] & 0x0002)) { ##if unpaired, add close condition
- my $parsed_cig = &parseCigar($tokens[5]);
- my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1;
- if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; }
- $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1;
- } else {
- #do nothing
- }
-}
-for (my $i = $current_position; $i <= $current_size; $i++) { ##Finish up the last sequence region
- if (defined($close{$i})) {
- $open = $open - $close{$i};
- delete $close{$i};
- }
- print $open . "\n";
-}
-print "\n";
-close(SAM);
-exit(0);
-
-##reads and tokenizes simple cigarline
-sub parseCigar() {
- my $cigar_line = shift;
- $cigar_line =~ s/([0-9]*[A-Z]{1})/$1\t/g;
- my @cigar_tokens = split /\t/, $cigar_line;
- my %parsed = ('M' => 0,
- 'I' => 0,
- 'D' => 0);
- my @events = ();
- for(my $i = 0; $i < scalar(@cigar_tokens); $i++) {
- if ($cigar_tokens[$i] =~ /([0-9]+)([A-Z]{1})/g) {
- if (!defined($parsed{$2})) { $parsed{$2} = 0; }
- my $nt = $2;
- if ($nt ne "M" && $nt ne "D" && $nt ne "I") { $nt = "M"; }
- $parsed{$nt} += $1;
- my %event_el = ("t" => $nt,
- "n" => $1);
- push @events, \%event_el;
- }
- }
- $parsed{'events'} = \@events;
- return \%parsed;
-}
diff --git a/sam/misc/maq2sam.c b/sam/misc/maq2sam.c
deleted file mode 100644
index 2bfbe2a..0000000
--- a/sam/misc/maq2sam.c
+++ /dev/null
@@ -1,173 +0,0 @@
-#include <string.h>
-#include <zlib.h>
-#include <stdio.h>
-#include <inttypes.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#define PACKAGE_VERSION "r439"
-
-//#define MAQ_LONGREADS
-
-#ifdef MAQ_LONGREADS
-# define MAX_READLEN 128
-#else
-# define MAX_READLEN 64
-#endif
-
-#define MAX_NAMELEN 36
-#define MAQMAP_FORMAT_OLD 0
-#define MAQMAP_FORMAT_NEW -1
-
-#define PAIRFLAG_FF 0x01
-#define PAIRFLAG_FR 0x02
-#define PAIRFLAG_RF 0x04
-#define PAIRFLAG_RR 0x08
-#define PAIRFLAG_PAIRED 0x10
-#define PAIRFLAG_DIFFCHR 0x20
-#define PAIRFLAG_NOMATCH 0x40
-#define PAIRFLAG_SW 0x80
-
-typedef struct
-{
- uint8_t seq[MAX_READLEN]; /* the last base is the single-end mapping quality. */
- uint8_t size, map_qual, info1, info2, c[2], flag, alt_qual;
- uint32_t seqid, pos;
- int dist;
- char name[MAX_NAMELEN];
-} maqmap1_t;
-
-typedef struct
-{
- int format, n_ref;
- char **ref_name;
- uint64_t n_mapped_reads;
- maqmap1_t *mapped_reads;
-} maqmap_t;
-
-maqmap_t *maq_new_maqmap()
-{
- maqmap_t *mm = (maqmap_t*)calloc(1, sizeof(maqmap_t));
- mm->format = MAQMAP_FORMAT_NEW;
- return mm;
-}
-void maq_delete_maqmap(maqmap_t *mm)
-{
- int i;
- if (mm == 0) return;
- for (i = 0; i < mm->n_ref; ++i)
- free(mm->ref_name[i]);
- free(mm->ref_name);
- free(mm->mapped_reads);
- free(mm);
-}
-maqmap_t *maqmap_read_header(gzFile fp)
-{
- maqmap_t *mm;
- int k, len;
- mm = maq_new_maqmap();
- gzread(fp, &mm->format, sizeof(int));
- if (mm->format != MAQMAP_FORMAT_NEW) {
- if (mm->format > 0) {
- fprintf(stderr, "** Obsolete map format is detected. Please use 'mapass2maq' command to convert the format.\n");
- exit(3);
- }
- assert(mm->format == MAQMAP_FORMAT_NEW);
- }
- gzread(fp, &mm->n_ref, sizeof(int));
- mm->ref_name = (char**)calloc(mm->n_ref, sizeof(char*));
- for (k = 0; k != mm->n_ref; ++k) {
- gzread(fp, &len, sizeof(int));
- mm->ref_name[k] = (char*)malloc(len * sizeof(char));
- gzread(fp, mm->ref_name[k], len);
- }
- /* read number of mapped reads */
- gzread(fp, &mm->n_mapped_reads, sizeof(uint64_t));
- return mm;
-}
-
-void maq2tam_core(gzFile fp, const char *rg)
-{
- maqmap_t *mm;
- maqmap1_t mm1, *m1;
- int ret;
- m1 = &mm1;
- mm = maqmap_read_header(fp);
- while ((ret = gzread(fp, m1, sizeof(maqmap1_t))) == sizeof(maqmap1_t)) {
- int j, flag = 0, se_mapq = m1->seq[MAX_READLEN-1];
- if (m1->flag) flag |= 1;
- if ((m1->flag&PAIRFLAG_PAIRED) || ((m1->flag&PAIRFLAG_SW) && m1->flag != 192)) flag |= 2;
- if (m1->flag == 192) flag |= 4;
- if (m1->flag == 64) flag |= 8;
- if (m1->pos&1) flag |= 0x10;
- if ((flag&1) && m1->dist != 0) {
- int c;
- if (m1->dist > 0) {
- if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_RF)) c = 0;
- else if (m1->flag&(PAIRFLAG_FR|PAIRFLAG_RR)) c = 1;
- else c = m1->pos&1;
- } else {
- if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_FR)) c = 0;
- else if (m1->flag&(PAIRFLAG_RF|PAIRFLAG_RR)) c = 1;
- else c = m1->pos&1;
- }
- if (c) flag |= 0x20;
- }
- if (m1->flag) {
- int l = strlen(m1->name);
- if (m1->name[l-2] == '/') {
- flag |= (m1->name[l-1] == '1')? 0x40 : 0x80;
- m1->name[l-2] = '\0';
- }
- }
- printf("%s\t%d\t", m1->name, flag);
- printf("%s\t%d\t", mm->ref_name[m1->seqid], (m1->pos>>1)+1);
- if (m1->flag == 130) {
- int c = (int8_t)m1->seq[MAX_READLEN-1];
- printf("%d\t", m1->alt_qual);
- if (c == 0) printf("%dM\t", m1->size);
- else {
- if (c > 0) printf("%dM%dI%dM\t", m1->map_qual, c, m1->size - m1->map_qual - c);
- else printf("%dM%dD%dM\t", m1->map_qual, -c, m1->size - m1->map_qual);
- }
- se_mapq = 0; // zero SE mapQ for reads aligned by SW
- } else {
- if (flag&4) printf("0\t*\t");
- else printf("%d\t%dM\t", m1->map_qual, m1->size);
- }
- printf("*\t0\t%d\t", m1->dist);
- for (j = 0; j != m1->size; ++j) {
- if (m1->seq[j] == 0) putchar('N');
- else putchar("ACGT"[m1->seq[j]>>6&3]);
- }
- putchar('\t');
- for (j = 0; j != m1->size; ++j)
- putchar((m1->seq[j]&0x3f) + 33);
- putchar('\t');
- if (rg) printf("RG:Z:%s\t", rg);
- if (flag&4) { // unmapped
- printf("MF:i:%d\n", m1->flag);
- } else {
- printf("MF:i:%d\t", m1->flag);
- if (m1->flag) printf("AM:i:%d\tSM:i:%d\t", m1->alt_qual, se_mapq);
- printf("NM:i:%d\tUQ:i:%d\tH0:i:%d\tH1:i:%d\n", m1->info1&0xf, m1->info2, m1->c[0], m1->c[1]);
- }
- }
- if (ret > 0)
- fprintf(stderr, "Truncated! Continue anyway.\n");
- maq_delete_maqmap(mm);
-}
-
-int main(int argc, char *argv[])
-{
- gzFile fp;
- if (argc == 1) {
- fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
- fprintf(stderr, "Usage: maq2sam <in.map> [<readGroup>]\n");
- return 1;
- }
- fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
- maq2tam_core(fp, argc > 2? argv[2] : 0);
- gzclose(fp);
- return 0;
-}
diff --git a/sam/misc/md5.c b/sam/misc/md5.c
deleted file mode 100644
index 55ae181..0000000
--- a/sam/misc/md5.c
+++ /dev/null
@@ -1,296 +0,0 @@
-/*
- * This code implements the MD5 message-digest algorithm.
- * The algorithm is due to Ron Rivest. This code was
- * written by Colin Plumb in 1993, no copyright is claimed.
- * This code is in the public domain; do with it what you wish.
- *
- * Equivalent code is available from RSA Data Security, Inc.
- * This code has been tested against that, and is equivalent,
- * except that you don't need to include two pages of legalese
- * with every copy.
- *
- * To compute the message digest of a chunk of bytes, declare an
- * MD5Context structure, pass it to MD5Init, call MD5Update as
- * needed on buffers full of bytes, and then call MD5Final, which
- * will fill a supplied 16-byte array with the digest.
- */
-
-/* Brutally hacked by John Walker back from ANSI C to K&R (no
- prototypes) to maintain the tradition that Netfone will compile
- with Sun's original "cc". */
-
-#include <string.h>
-#include "md5.h"
-
-#ifndef HIGHFIRST
-#define byteReverse(buf, len) /* Nothing */
-#else
-/*
- * Note: this code is harmless on little-endian machines.
- */
-void byteReverse(buf, longs)
- unsigned char *buf; unsigned longs;
-{
- uint32_t t;
- do {
- t = (uint32_t) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
- ((unsigned) buf[1] << 8 | buf[0]);
- *(uint32_t *) buf = t;
- buf += 4;
- } while (--longs);
-}
-#endif
-
-void MD5Transform(uint32_t buf[4], uint32_t in[16]);
-
-
-/*
- * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious
- * initialization constants.
- */
-void MD5Init(ctx)
- struct MD5Context *ctx;
-{
- ctx->buf[0] = 0x67452301;
- ctx->buf[1] = 0xefcdab89;
- ctx->buf[2] = 0x98badcfe;
- ctx->buf[3] = 0x10325476;
-
- ctx->bits[0] = 0;
- ctx->bits[1] = 0;
-}
-
-/*
- * Update context to reflect the concatenation of another buffer full
- * of bytes.
- */
-void MD5Update(ctx, buf, len)
- struct MD5Context *ctx; unsigned char *buf; unsigned len;
-{
- uint32_t t;
-
- /* Update bitcount */
-
- t = ctx->bits[0];
- if ((ctx->bits[0] = t + ((uint32_t) len << 3)) < t)
- ctx->bits[1]++; /* Carry from low to high */
- ctx->bits[1] += len >> 29;
-
- t = (t >> 3) & 0x3f; /* Bytes already in shsInfo->data */
-
- /* Handle any leading odd-sized chunks */
-
- if (t) {
- unsigned char *p = (unsigned char *) ctx->in + t;
-
- t = 64 - t;
- if (len < t) {
- memcpy(p, buf, len);
- return;
- }
- memcpy(p, buf, t);
- byteReverse(ctx->in, 16);
- MD5Transform(ctx->buf, (uint32_t *) ctx->in);
- buf += t;
- len -= t;
- }
- /* Process data in 64-byte chunks */
-
- while (len >= 64) {
- memcpy(ctx->in, buf, 64);
- byteReverse(ctx->in, 16);
- MD5Transform(ctx->buf, (uint32_t *) ctx->in);
- buf += 64;
- len -= 64;
- }
-
- /* Handle any remaining bytes of data. */
-
- memcpy(ctx->in, buf, len);
-}
-
-/*
- * Final wrapup - pad to 64-byte boundary with the bit pattern
- * 1 0* (64-bit count of bits processed, MSB-first)
- */
-void MD5Final(digest, ctx)
- unsigned char digest[16]; struct MD5Context *ctx;
-{
- unsigned count;
- unsigned char *p;
-
- /* Compute number of bytes mod 64 */
- count = (ctx->bits[0] >> 3) & 0x3F;
-
- /* Set the first char of padding to 0x80. This is safe since there is
- always at least one byte free */
- p = ctx->in + count;
- *p++ = 0x80;
-
- /* Bytes of padding needed to make 64 bytes */
- count = 64 - 1 - count;
-
- /* Pad out to 56 mod 64 */
- if (count < 8) {
- /* Two lots of padding: Pad the first block to 64 bytes */
- memset(p, 0, count);
- byteReverse(ctx->in, 16);
- MD5Transform(ctx->buf, (uint32_t *) ctx->in);
-
- /* Now fill the next block with 56 bytes */
- memset(ctx->in, 0, 56);
- } else {
- /* Pad block to 56 bytes */
- memset(p, 0, count - 8);
- }
- byteReverse(ctx->in, 14);
-
- /* Append length in bits and transform */
- ((uint32_t *) ctx->in)[14] = ctx->bits[0];
- ((uint32_t *) ctx->in)[15] = ctx->bits[1];
-
- MD5Transform(ctx->buf, (uint32_t *) ctx->in);
- byteReverse((unsigned char *) ctx->buf, 4);
- memcpy(digest, ctx->buf, 16);
- memset(ctx, 0, sizeof(ctx)); /* In case it's sensitive */
-}
-
-
-/* The four core functions - F1 is optimized somewhat */
-
-/* #define F1(x, y, z) (x & y | ~x & z) */
-#define F1(x, y, z) (z ^ (x & (y ^ z)))
-#define F2(x, y, z) F1(z, x, y)
-#define F3(x, y, z) (x ^ y ^ z)
-#define F4(x, y, z) (y ^ (x | ~z))
-
-/* This is the central step in the MD5 algorithm. */
-#define MD5STEP(f, w, x, y, z, data, s) \
- ( w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x )
-
-/*
- * The core of the MD5 algorithm, this alters an existing MD5 hash to
- * reflect the addition of 16 longwords of new data. MD5Update blocks
- * the data and converts bytes into longwords for this routine.
- */
-void MD5Transform(buf, in)
- uint32_t buf[4]; uint32_t in[16];
-{
- register uint32_t a, b, c, d;
-
- a = buf[0];
- b = buf[1];
- c = buf[2];
- d = buf[3];
-
- MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
- MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
- MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
- MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
- MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
- MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
- MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
- MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
- MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
- MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
- MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
- MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
- MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
- MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
- MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
- MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
-
- MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
- MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
- MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
- MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
- MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
- MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
- MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
- MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
- MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
- MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
- MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
- MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
- MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
- MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
- MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
- MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
-
- MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
- MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
- MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
- MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
- MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
- MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
- MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
- MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
- MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
- MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
- MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
- MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
- MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
- MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
- MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
- MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
-
- MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
- MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
- MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
- MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
- MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
- MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
- MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
- MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
- MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
- MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
- MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
- MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
- MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
- MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
- MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
- MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
-
- buf[0] += a;
- buf[1] += b;
- buf[2] += c;
- buf[3] += d;
-}
-
-/* lh3: the following code is added by me */
-
-#ifdef MD5SUM_MAIN
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#define HEX_STR "0123456789abcdef"
-
-static void md5_one(const char *fn)
-{
- unsigned char buf[4096], digest[16];
- MD5_CTX md5;
- int l;
- FILE *fp;
-
- fp = strcmp(fn, "-")? fopen(fn, "r") : stdin;
- if (fp == 0) {
- fprintf(stderr, "md5sum: %s: No such file or directory\n", fn);
- exit(1);
- }
- MD5Init(&md5);
- while ((l = fread(buf, 1, 4096, fp)) > 0)
- MD5Update(&md5, buf, l);
- MD5Final(digest, &md5);
- if (fp != stdin) fclose(fp);
- for (l = 0; l < 16; ++l)
- printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]);
- printf(" %s\n", fn);
-}
-int main(int argc, char *argv[])
-{
- int i;
- if (argc == 1) md5_one("-");
- else for (i = 1; i < argc; ++i) md5_one(argv[i]);
- return 0;
-}
-#endif
diff --git a/sam/misc/md5.h b/sam/misc/md5.h
deleted file mode 100644
index 44121e4..0000000
--- a/sam/misc/md5.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- This file is adapted from a program in this page:
-
- http://www.fourmilab.ch/md5/
-
- The original source code does not work on 64-bit machines due to the
- wrong typedef "uint32". I also added prototypes.
-
- -lh3
- */
-
-#ifndef MD5_H
-#define MD5_H
-
-/* The following tests optimise behaviour on little-endian
- machines, where there is no need to reverse the byte order
- of 32 bit words in the MD5 computation. By default,
- HIGHFIRST is defined, which indicates we're running on a
- big-endian (most significant byte first) machine, on which
- the byteReverse function in md5.c must be invoked. However,
- byteReverse is coded in such a way that it is an identity
- function when run on a little-endian machine, so calling it
- on such a platform causes no harm apart from wasting time.
- If the platform is known to be little-endian, we speed
- things up by undefining HIGHFIRST, which defines
- byteReverse as a null macro. Doing things in this manner
- insures we work on new platforms regardless of their byte
- order. */
-
-#define HIGHFIRST
-
-#if __LITTLE_ENDIAN__ != 0
-#undef HIGHFIRST
-#endif
-
-#include <stdint.h>
-
-struct MD5Context {
- uint32_t buf[4];
- uint32_t bits[2];
- unsigned char in[64];
-};
-
-void MD5Init(struct MD5Context *ctx);
-void MD5Update(struct MD5Context *ctx, unsigned char *buf, unsigned len);
-void MD5Final(unsigned char digest[16], struct MD5Context *ctx);
-
-/*
- * This is needed to make RSAREF happy on some MS-DOS compilers.
- */
-typedef struct MD5Context MD5_CTX;
-
-/* Define CHECK_HARDWARE_PROPERTIES to have main,c verify
- byte order and uint32_t settings. */
-#define CHECK_HARDWARE_PROPERTIES
-
-#endif /* !MD5_H */
diff --git a/sam/misc/md5fa.c b/sam/misc/md5fa.c
deleted file mode 100644
index 7a165bf..0000000
--- a/sam/misc/md5fa.c
+++ /dev/null
@@ -1,58 +0,0 @@
-#include <stdio.h>
-#include <zlib.h>
-#include "md5.h"
-#include "kseq.h"
-
-#define HEX_STR "0123456789abcdef"
-
-KSEQ_INIT(gzFile, gzread)
-
-static void md5_one(const char *fn)
-{
- MD5_CTX md5_one, md5_all;
- int l, i, k;
- gzFile fp;
- kseq_t *seq;
- unsigned char unordered[16], digest[16];
-
- for (l = 0; l < 16; ++l) unordered[l] = 0;
- fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
- if (fp == 0) {
- fprintf(stderr, "md5fa: %s: No such file or directory\n", fn);
- exit(1);
- }
-
- MD5Init(&md5_all);
- seq = kseq_init(fp);
- while ((l = kseq_read(seq)) >= 0) {
- for (i = k = 0; i < seq->seq.l; ++i) {
- if (islower(seq->seq.s[i])) seq->seq.s[k++] = toupper(seq->seq.s[i]);
- else if (isupper(seq->seq.s[i])) seq->seq.s[k++] = seq->seq.s[i];
- }
- MD5Init(&md5_one);
- MD5Update(&md5_one, (unsigned char*)seq->seq.s, k);
- MD5Final(digest, &md5_one);
- for (l = 0; l < 16; ++l) {
- printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]);
- unordered[l] ^= digest[l];
- }
- printf(" %s %s\n", fn, seq->name.s);
- MD5Update(&md5_all, (unsigned char*)seq->seq.s, k);
- }
- MD5Final(digest, &md5_all);
- kseq_destroy(seq);
- for (l = 0; l < 16; ++l)
- printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]);
- printf(" %s >ordered\n", fn);
- for (l = 0; l < 16; ++l)
- printf("%c%c", HEX_STR[unordered[l]>>4&0xf], HEX_STR[unordered[l]&0xf]);
- printf(" %s >unordered\n", fn);
-}
-
-int main(int argc, char *argv[])
-{
- int i;
- if (argc == 1) md5_one("-");
- else for (i = 1; i < argc; ++i) md5_one(argv[i]);
- return 0;
-}
diff --git a/sam/misc/novo2sam.pl b/sam/misc/novo2sam.pl
deleted file mode 100755
index 8b53c9e..0000000
--- a/sam/misc/novo2sam.pl
+++ /dev/null
@@ -1,281 +0,0 @@
-#!/usr/bin/perl -w
-
-# Contact: lh3
-# Version: 0.1.3
-
-#Modified by Zayed Albertyn(***@gmail.com) & Colin Hercus(***@novocraft.com)
-
-#use strict;
-#use warnings;
-use Data::Dumper;
-use Getopt::Std;
-
-&novo2sam;
-exit;
-
-sub mating {
- my ($s1, $s2) = @_;
- my $isize = 0;
- if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize
- my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3];
- my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3];
- $isize = $x2 - $x1;
- }
- # update mate coordinate
- if ($s2->[2] ne '*') {
- @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize);
- $s1->[1] |= 0x20 if ($s2->[1] & 0x10);
- } else {
- $s1->[1] |= 0x8;
- }
- if ($s1->[2] ne '*') {
- @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize);
- $s2->[1] |= 0x20 if ($s1->[1] & 0x10);
- } else {
- $s2->[1] |= 0x8;
- }
-}
-
-sub novo2sam {
- my %opts = ();
- getopts("p", \%opts);
- die("Usage: novo2sam.pl [-p] <aln.novo>\n") if (@ARGV == 0);
- my $is_paired = defined($opts{p});
- # core loop
- my @s1 = ();
- my @s2 = ();
- my ($s_last, $s_curr) = (\@s1, \@s2);
- while (<>) {
- next if (/^#/);
- next if (/(QC|NM)\s*$/ || /(R\s+\d+)\s*$/);
- &novo2sam_aux($_, $s_curr, $is_paired);
- if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) {
- &mating($s_last, $s_curr);
- print join("\t", @$s_last), "\n";
- print join("\t", @$s_curr), "\n";
- @$s_last = (); @$s_curr = ();
- } else {
- print join("\t", @$s_last), "\n" if (@$s_last != 0);
- my $s = $s_last; $s_last = $s_curr; $s_curr = $s;
- }
- }
- print join("\t", @$s_last), "\n" if (@$s_last != 0);
-}
-
-sub novo2sam_aux {
- my ($line, $s, $is_paired) = @_;
-
- chomp($line);
- my @t = split(/\s+/, $line);
- my @variations = @t[13 .. $#t];
- @$s = ();
- return if ($t[4] ne 'U');
- my $len = length($t[2]);
- # read name
- $s->[0] = substr($t[0], 1);
- $s->[0] =~ s/\/[12]$//g;
- # initial flag (will be updated later)
- $s->[1] = 0;
- $s->[1] |= 1 | 1<<($t[1] eq 'L'? 6 : 7);
- $s->[1] |= 2 if ($t[10] eq '.');
- # read & quality
- if ($t[9] eq 'R') {
- $s->[9] = reverse($t[2]);
- $s->[10] = reverse($t[3]);
- $s->[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
- } else {
- $s->[9] = $t[2]; $s->[10] = $t[3];
- }
- # cigar
- my $cigarstring ="";
- if (scalar @variations ==0 ) {
- $s->[5] = $len . "M"; # IMPORTANT: this cigar is not correct for gapped alignment
- } else {
- #convert to correct CIGAR
- my $tmpstr = join" ",@variations ;
- if ( $tmpstr=~ /\+|\-/ ) {
- $cigarstring = cigar_method($line,\@variations,$len);
- $s->[5]=$cigarstring;
- } else {
- $s->[5]=$len. "M";
- }
-}
-
-# coor
- $s->[2] = substr($t[7], 1); $s->[3] = $t[8];
- $s->[1] |= 0x10 if ($t[9] eq 'R');
- # mapQ
- $s->[4] = $t[5] > $t[6]? $t[5] : $t[6];
- # mate coordinate
- $s->[6] = '*'; $s->[7] = $s->[8] = 0;
- # aux
- push(@$s, "NM:i:".(@t-13));
- my $md = '';
- $md = mdtag($md,$line,\@variations,$len);
- push(@$s, "MD:Z:$md");
-
-}
-
-sub mdtag {
- my $oldmd = shift;
- my $line = shift;
- my $ref =shift;
- my $rdlen = shift;
- my @variations = @$ref;
- my $string="";
- my $mdtag="";
- my $t=1;
- my $q=1;
- my $deleteflag=0;
- my $len =0;
- foreach $string (@variations) {
- my ($indeltype,$insert) = indeltype($string);
- if ($indeltype eq "+") {
- $len = length ($insert);
- $q+=$len;
- next;
- }
- my $pos = $1 if $string =~ /^(\d+)/;
- $len = $pos - $t;
- if ($len !=0 || ($deleteflag eq 1 && $indeltype eq ">")) {
- $mdtag.=$len;
- }
- $t+=$len;
- $q+=$len;
- if ($indeltype eq ">") {
- $mdtag.=$insert;
- $deleteflag=0;
- $t+=1;
- $q+=1;
- }
- if ($indeltype eq "-") {
- my $deletedbase = $2 if $string =~ /(\d+)\-([A-Za-z]+)/;
- if ($deleteflag == 0 ) {
- $mdtag.="^";
- }
- $mdtag.=$deletedbase;
- $deleteflag=1;
- $t+=1;
- }
- }
- $len = $rdlen - $q + 1;
- if ($len > 0) {
- $mdtag.="$len";
- }
-# print "In:$line\n";
-# print "MD: OLD => NEW\nMD: $oldmd => $mdtag\n\n";
-
- return $mdtag;
-}
-
-sub indeltype {
- my $string = shift;
- my $insert="";
- my $indeltype;
- if ($string =~ /([A-Za-z]+)\>/) {
- $indeltype=">";
- $insert=$1;
- } elsif ($string =~ /\-/) {
- $indeltype="-";
- } elsif ($string =~ /\+([A-Za-z]+)/) {
- $indeltype="+";
- $insert=$1;
- }
- return ($indeltype,$insert);
-
-}
-
-
-sub cigar_method {
- my $line = shift;
- my $ref =shift;
- my $rdlen = shift;
- my @variations = @$ref;
- my $string="";
- my $type="";
- my $t =1;
- my $q=1;
- my $indeltype="";
- my $cigar= "";
- my $insert = "";
- my $len=0;
- my @cig=();
- foreach $string (@variations) {
- next if $string =~ />/;
- my $pos = $1 if $string =~ /^(\d+)/;
-
- if ($string =~ /\+([A-Za-z]+)/) {
- $indeltype="+";
- $insert = $1;
- }elsif ($string =~ /\-([A-Za-z]+)/) {
- $indeltype="-";
- $insert = $1;
- }
-#print "$pos $indeltype $insert $t $q\n";
- $len = $pos - $t;
- if ( $len > 0) {
- $cigar.=$len."M";
- push(@cig,$len."M");
- }
- $t+=$len;
- $q+=$len;
-
- if ($indeltype eq "-") {
- $cigar.="D";
- push(@cig,"D");
- $t++;
- }
- if ($indeltype eq "+") {
- $len = length ($insert);
- if ($len == 1) {
- $cigar.="I";
- push(@cig,"I");
- }
- if ($len > 1) {
- $cigar.=$len."I";
- push(@cig,$len."I")
- }
- $q+=$len;
- }
- $insert="";
- }
- $len= $rdlen - $q + 1;
- if ($len > 0) {
- $cigar.=$len."M";
- push(@cig,$len."M");
- }
-
- $cigar = newcigar($cigar,'D');
- $cigar = newcigar($cigar,'I');
-
- #print "$line\n";
- #print "c CIGAR:\t$cigar\n\n";
- return $cigar;
-
-}
-
-
-
-sub newcigar {
- my $cigar = shift;
- my $char = shift;
- my $new = "";
- my $copy = $cigar;
-#print "$cigar\n";
- $copy =~ s/^($char+)/$1;/g;
-#print "$copy\n";
- $copy =~ s/([^0-9$char])($char+)/$1;$2;/g;
-#print "$copy\n";
- my @parts = split(/;/,$copy);
- my $el="";
- foreach $el (@parts) {
-#print "$el\n";
- if ($el =~ /^$char+$/) {
- $new.=length($el).$char;
- }else {
- $new.=$el;
- }
-
- }
- return $new;
-}
diff --git a/sam/misc/plot-bamcheck b/sam/misc/plot-bamcheck
deleted file mode 100755
index 1792c6f..0000000
--- a/sam/misc/plot-bamcheck
+++ /dev/null
@@ -1,882 +0,0 @@
-#!/usr/bin/env perl
-#
-# Author: ***@sanger
-#
-
-use strict;
-use warnings;
-use Carp;
-
-my $opts = parse_params();
-parse_bamcheck($opts);
-plot_qualities($opts);
-plot_acgt_cycles($opts);
-plot_gc($opts);
-plot_gc_depth($opts);
-plot_isize($opts);
-plot_coverage($opts);
-plot_mismatches_per_cycle($opts);
-plot_indel_dist($opts);
-plot_indel_cycles($opts);
-
-exit;
-
-#--------------------------------
-
-sub error
-{
- my (@msg) = @_;
- if ( scalar @msg ) { confess @msg; }
- die
- "Usage: plot-bamcheck [OPTIONS] file.bam.bc\n",
- " plot-bamcheck -p outdir/ file.bam.bc\n",
- "Options:\n",
- " -k, --keep-files Do not remove temporary files.\n",
- " -p, --prefix <path> The output files prefix, add a slash to create new directory.\n",
- " -r, --ref-stats <file.fa.gc> Optional reference stats file with expected GC content (created with -s).\n",
- " -s, --do-ref-stats <file.fa> Calculate reference sequence GC for later use with -r\n",
- " -t, --targets <file.tab> Restrict -s to the listed regions (tab-delimited chr,from,to. 1-based, inclusive)\n",
- " -h, -?, --help This help message.\n",
- "\n";
-}
-
-
-sub parse_params
-{
- $0 =~ s{^.+/}{};
- my $opts = { args=>join(' ',$0,@ARGV) };
- while (defined(my $arg=shift(@ARGV)))
- {
- if ( $arg eq '-k' || $arg eq '--keep-files' ) { $$opts{keep_files}=1; next; }
- if ( $arg eq '-r' || $arg eq '--ref-stats' ) { $$opts{ref_stats}=shift(@ARGV); next; }
- if ( $arg eq '-s' || $arg eq '--do-ref-stats' ) { $$opts{do_ref_stats}=shift(@ARGV); next; }
- if ( $arg eq '-t' || $arg eq '--targets' ) { $$opts{targets}=shift(@ARGV); next; }
- if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; }
- if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
- if ( -e $arg ) { $$opts{bamcheck}=$arg; next; }
- error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n");
- }
- if ( exists($$opts{do_ref_stats }) ) { do_ref_stats($opts); exit; }
- if ( !exists($$opts{bamcheck}) ) { error("No bamcheck file?\n") }
- if ( !exists($$opts{prefix}) ) { error("Expected -p parameter.\n") }
- if ( $$opts{prefix}=~m{/$} ) { `mkdir -p $$opts{prefix}`; }
- elsif ( !($$opts{prefix}=~/-$/) ) { $$opts{prefix} .= '-'; }
- return $opts;
-}
-
-
-# Creates GC stats for either the whole reference or only on target regions for exome QC
-sub do_ref_stats
-{
- my ($opts) = @_;
-
-
- my %targets = ();
- if ( exists($$opts{targets}) )
- {
- my ($prev_chr,$prev_pos);
- open(my $fh,'<',$$opts{targets}) or error("$$opts{targets}: $!");
- while (my $line=<$fh>)
- {
- if ( $line=~/^#/ ) { next; }
- my ($chr,$from,$to) = split(/\s+/,$line);
- chomp($to);
- push @{$targets{$chr}}, $from,$to;
- if ( !defined $prev_chr or $chr ne $prev_chr ) { $prev_chr=$chr; $prev_pos=$from }
- if ( $prev_pos > $from ) { error("The file must be sorted: $$opts{targets}\n"); }
- $prev_pos = $from;
- }
- close($fh);
- }
-
- my $_len = 60; # for now do only standard fasta's with 60 bases per line
- my %gc_counts = ();
- my ($skip_chr,$pos,$ireg,$regions);
- open(my $fh,'<',$$opts{do_ref_stats}) or error("$$opts{do_ref_stats}: $!");
- while (my $line=<$fh>)
- {
- if ( $line=~/^>/ )
- {
- if ( !scalar %targets ) { next; }
-
- if ( !($line=~/>(\S+)/) ) { error("FIXME: could not determine chromosome name: $line"); }
- if ( !exists($targets{$1}) ) { $skip_chr=$1; next; }
- undef $skip_chr;
- $pos = 0;
- $ireg = 0;
- $regions = $targets{$1};
- }
- if ( defined $skip_chr ) { next; }
-
- # Only $_len sized lines are considered and no chopping for target regions.
- chomp($line);
- my $len = length($line);
- if ( $len ne $_len ) { next; }
-
- if ( scalar %targets )
- {
- while ( $ireg<@$regions && $$regions[$ireg+1]<=$pos ) { $ireg += 2; }
- $pos += $len;
- if ( $ireg==@$regions ) { next; }
- if ( $pos < $$regions[$ireg] ) { next; }
- }
-
- my $gc_count = 0;
- for (my $i=0; $i<$len; $i++)
- {
- my $base = substr($line,$i,1);
- if ( $base eq 'g' || $base eq 'G' || $base eq 'c' || $base eq 'C' ) { $gc_count++; }
- }
- $gc_counts{$gc_count}++;
- }
-
- print "# Generated by $$opts{args}\n";
- print "# The columns are: GC content bin, normalized frequency\n";
- my $max;
- for my $count (values %gc_counts)
- {
- if ( !defined $max or $count>$max ) { $max=$count; }
- }
- for my $gc (sort {$a<=>$b} keys %gc_counts)
- {
- if ( $gc==0 ) { next; }
- printf "%f\t%f\n", $gc*100./$_len, $gc_counts{$gc}/$max;
- }
-}
-
-sub plot
-{
- my ($cmdfile) = @_;
- my $cmd = "gnuplot $cmdfile";
- system($cmd);
- if ( $? ) { error("The command exited with non-zero status $?:\n\t$cmd\n\n"); }
-}
-
-
-sub parse_bamcheck
-{
- my ($opts) = @_;
- open(my $fh,'<',$$opts{bamcheck}) or error("$$opts{bamcheck}: $!");
- my $line = <$fh>;
- if ( !($line=~/^# This file was produced by bamcheck (\S+)/) ) { error("Sanity check failed: was this file generated by bamcheck?"); }
- $$opts{dat}{version} = $1;
- while ($line=<$fh>)
- {
- if ( $line=~/^#/ ) { next; }
- my @items = split(/\t/,$line);
- chomp($items[-1]);
- if ( $items[0] eq 'SN' )
- {
- $$opts{dat}{$items[1]} = splice(@items,2);
- next;
- }
- push @{$$opts{dat}{$items[0]}}, [splice(@items,1)];
- }
- close($fh);
-
- # Check sanity
- if ( !exists($$opts{dat}{'sequences:'}) or !$$opts{dat}{'sequences:'} )
- {
- error("Sanity check failed: no sequences found by bamcheck??\n");
- }
-}
-
-sub older_than
-{
- my ($opts,$version) = @_;
- my ($year,$month,$day) = split(/-/,$version);
- $version = $$opts{dat}{version};
- if ( !($version=~/$(\d+)-(\d+)-(\d+)$$/) ) { return 1; }
- if ( $1<$year ) { return 1; }
- elsif ( $1>$year ) { return 0; }
- if ( $2<$month ) { return 1; }
- elsif ( $2>$month ) { return 0; }
- if ( $3<$day ) { return 1; }
- return 0;
-}
-
-sub get_defaults
-{
- my ($opts,$img_fname,%args) = @_;
-
- if ( !($img_fname=~/\.png$/i) ) { error("FIXME: currently only PNG supported. (Easy to extend.)\n"); }
-
- # Determine the gnuplot script file name
- my $gp_file = $img_fname;
- $gp_file =~ s{\.[^.]+$}{.gp};
- if ( !($gp_file=~/.gp$/) ) { $gp_file .= '.gp'; }
-
- # Determine the default title:
- # 5446_6/5446_6.bam.bc.gp -> 5446_6
- # test.aaa.png -> test.aaa
- if ( !($$opts{bamcheck}=~m{([^/]+?)(?:\.bam)?(?:\.bc)?$}i) ) { error("FIXME: Could not determine the title from [$img_fname]\n"); }
- my $title = $1;
-
- my $dir = $gp_file;
- $dir =~ s{/[^/]+$}{};
- if ( $dir && $dir ne $gp_file ) { `mkdir -p $dir`; }
-
- my $wh = exists($args{wh}) ? $args{wh} : '600,400';
-
- open(my $fh,'>',$gp_file) or error("$gp_file: $!");
- return {
- title => $title,
- gp => $gp_file,
- img => $img_fname,
- fh => $fh,
- terminal => qq[set terminal png size $wh truecolor],
- grid => 'set grid xtics ytics y2tics back lc rgb "#cccccc"',
- };
-}
-
-sub percentile
-{
- my ($p,@vals) = @_;
- my $N = 0;
- for my $val (@vals) { $N += $val; }
- my $n = $p*($N+1)/100.;
- my $k = int($n);
- my $d = $n-$k;
- if ( $k<=0 ) { return 0; }
- if ( $k>=$N ) { return scalar @vals-1; }
- my $cnt;
- for (my $i=0; $i<@vals; $i++)
- {
- $cnt += $vals[$i];
- if ( $cnt>=$k ) { return $i; }
- }
- error("FIXME: this should not happen [percentile]\n");
-}
-
-sub plot_qualities
-{
- my ($opts) = @_;
-
- if ( !exists($$opts{dat}{FFQ}) or !@{$$opts{dat}{FFQ}} ) { return; }
-
- my $yrange = @{$$opts{dat}{FFQ}[0]} > 50 ? @{$$opts{dat}{FFQ}[0]} : 50;
- my $is_paired = $$opts{dat}{'is paired:'};
-
- # Average quality per cycle, forward and reverse reads in one plot
- my $args = get_defaults($opts,"$$opts{prefix}quals.png");
- my $fh = $$args{fh};
- print $fh qq[
- $$args{terminal}
- set output "$$args{img}"
- $$args{grid}
- set ylabel "Average Quality"
- set xlabel "Cycle"
- set yrange [0:$yrange]
- set title "$$args{title}"
- plot '-' using 1:2 with lines title 'Forward reads' ] . ($is_paired ? q[, '-' using 1:2 with lines title 'Reverse reads'] : '') . q[
- ];
- my (@fp75,@fp50,@fmean);
- my (@lp75,@lp50,@lmean);
- my ($fmax,$fmax_qual,$fmax_cycle);
- my ($lmax,$lmax_qual,$lmax_cycle);
- for my $cycle (@{$$opts{dat}{FFQ}})
- {
- my $sum=0; my $n=0;
- for (my $iqual=1; $iqual<@$cycle; $iqual++)
- {
- $sum += $$cycle[$iqual]*$iqual;
- $n += $$cycle[$iqual];
- if ( !defined $fmax or $fmax<$$cycle[$iqual] ) { $fmax=$$cycle[$iqual]; $fmax_qual=$iqual; $fmax_cycle=$$cycle[0]; }
- }
- my $p25 = percentile(25,(@$cycle)[1..$#$cycle]);
- my $p50 = percentile(50,(@$cycle)[1..$#$cycle]);
- my $p75 = percentile(75,(@$cycle)[1..$#$cycle]);
- if ( !$n ) { next; }
- push @fp75, "$$cycle[0]\t$p25\t$p75\n";
- push @fp50, "$$cycle[0]\t$p50\n";
- push @fmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n;
- printf $fh $fmean[-1];
- }
- print $fh "end\n";
- if ( $is_paired )
- {
- for my $cycle (@{$$opts{dat}{LFQ}})
- {
- my $sum=0; my $n=0;
- for (my $iqual=1; $iqual<@$cycle; $iqual++)
- {
- $sum += $$cycle[$iqual]*$iqual;
- $n += $$cycle[$iqual];
- if ( !defined $lmax or $lmax<$$cycle[$iqual] ) { $lmax=$$cycle[$iqual]; $lmax_qual=$iqual; $lmax_cycle=$$cycle[0]; }
- }
- my $p25 = percentile(25,(@$cycle)[1..$#$cycle]);
- my $p50 = percentile(50,(@$cycle)[1..$#$cycle]);
- my $p75 = percentile(75,(@$cycle)[1..$#$cycle]);
- if ( !$n ) { next; }
- push @lp75, "$$cycle[0]\t$p25\t$p75\n";
- push @lp50, "$$cycle[0]\t$p50\n";
- push @lmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n;
- printf $fh $lmean[-1];
- }
- print $fh "end\n";
- }
- close($fh);
- plot($$args{gp});
-
-
-
- # Average, mean and quality percentiles per cycle, forward and reverse reads in separate plots
- $args = get_defaults($opts,"$$opts{prefix}quals2.png",wh=>'700,500');
- $fh = $$args{fh};
- print $fh qq[
- $$args{terminal}
- set output "$$args{img}"
- $$args{grid}
- set multiplot
- set rmargin 0
- set lmargin 0
- set tmargin 0
- set bmargin 0
- set origin 0.1,0.1
- set size 0.4,0.8
- set yrange [0:$yrange]
- set ylabel "Quality"
- set xlabel "Cycle (fwd reads)"
- plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 1 t 'Mean'
- ];
- print $fh join('',@fp75),"end\n";
- print $fh join('',@fp50),"end\n";
- print $fh join('',@fmean),"end\n";
- if ( $is_paired )
- {
- print $fh qq[
- set origin 0.55,0.1
- set size 0.4,0.8
- unset ytics
- set y2tics mirror
- set yrange [0:$yrange]
- unset ylabel
- set xlabel "Cycle (rev reads)"
- set label "$$args{title}" at screen 0.5,0.95 center
- plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 2 t 'Mean'
- ];
- print $fh join('',@lp75),"end\n";
- print $fh join('',@lp50),"end\n";
- print $fh join('',@lmean),"end\n";
- }
- close($fh);
- plot($$args{gp});
-
-
-
- # Quality distribution per cycle, the distribution is for each cycle plotted as a separate curve
- $args = get_defaults($opts,"$$opts{prefix}quals3.png",wh=>'600,600');
- $fh = $$args{fh};
- my $nquals = @{$$opts{dat}{FFQ}[0]}-1;
- my $ncycles = @{$$opts{dat}{FFQ}};
- print $fh qq[
- $$args{terminal}
- set output "$$args{img}"
- $$args{grid}
- set multiplot
- set rmargin 0
- set lmargin 0
- set tmargin 0
- set bmargin 0
- set origin 0.15,0.52
- set size 0.8,0.4
- set title "$$args{title}"
- set ylabel "Frequency (fwd reads)"
- set label "Cycle $fmax_cycle" at $fmax_qual+1,$fmax
- unset xlabel
- set xrange [0:$nquals]
- set format x ""
- ];
- my @plots;
- for (my $i=0; $i<$ncycles; $i++) { push @plots, q['-' using 1:2 with lines t ''] }
- print $fh "plot ", join(",", @plots), "\n";
- for my $cycle (@{$$opts{dat}{FFQ}})
- {
- for (my $iqual=1; $iqual<$nquals; $iqual++) { print $fh "$iqual\t$$cycle[$iqual]\n"; }
- print $fh "end\n";
- }
- if ( $is_paired )
- {
- print $fh qq[
- set origin 0.15,0.1
- set size 0.8,0.4
- unset title
- unset format
- set xtics
- set xlabel "Quality"
- unset label
- set label "Cycle $lmax_cycle" at $lmax_qual+1,$lmax
- set ylabel "Frequency (rev reads)"
- ];
- print $fh "plot ", join(",", @plots), "\n";
- for my $cycle (@{$$opts{dat}{LFQ}})
- {
- for (my $iqual=1; $iqual<$nquals; $iqual++)
- {
- print $fh "$iqual\t$$cycle[$iqual]\n";
- }
- print $fh "end\n";
- }
- }
- close($fh);
- plot($$args{gp});
-
-
- # Heatmap qualitites
- $args = get_defaults($opts,"$$opts{prefix}quals-hm.png", wh=>'600,500');
- $fh = $$args{fh};
- my $max = defined $lmax && $lmax > $fmax ? $lmax : $fmax;
- my @ytics;
- for my $cycle (@{$$opts{dat}{FFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } }
- my $ytics = join(',', @ytics);
- print $fh qq[
- $$args{terminal}
- set output "$$args{img}"
- unset key
- unset colorbox
- set palette defined (0 0 0 0, 1 0 0 1, 3 0 1 0, 4 1 0 0, 6 1 1 1)
- set cbrange [0:$max]
- set yrange [0:$ncycles]
- set xrange [0:$nquals]
- set view map
- set multiplot
- set rmargin 0
- set lmargin 0
- set tmargin 0
- set bmargin 0
- set origin 0,0.46
- set size 0.95,0.6
- set obj 1 rectangle behind from first 0,0 to first $nquals,$ncycles
- set obj 1 fillstyle solid 1.0 fillcolor rgbcolor "black"
- set ylabel "Cycle (fwd reads)" offset character -1,0
- unset ytics
- set ytics ($ytics)
- unset xtics
- set title "$$args{title}"
- splot '-' matrix with image
- ];
- for my $cycle (@{$$opts{dat}{FFQ}})
- {
- for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; }
- print $fh "\n";
- }
- print $fh "end\nend\n";
- @ytics = ();
- for my $cycle (@{$$opts{dat}{LFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } }
- $ytics = join(',', @ytics);
- print $fh qq[
- set origin 0,0.03
- set size 0.95,0.6
- set ylabel "Cycle (rev reads)" offset character -1,0
- set xlabel "Base Quality"
- unset title
- unset ytics
- set ytics ($ytics)
- set xrange [0:$nquals]
- set xtics
- set colorbox vertical user origin first ($nquals+1),0 size screen 0.025,0.812
- set cblabel "Number of bases"
- splot '-' matrix with image
- ];
- for my $cycle (@{$$opts{dat}{LFQ}})
- {
- for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; }
- print $fh "\n";
- }
- print $fh "end\nend\n";
- close($fh);
- plot($$args{gp});
-}
-
-
-sub plot_acgt_cycles
-{
- my ($opts) = @_;
-
- if ( !exists($$opts{dat}{GCC}) or !@{$$opts{dat}{GCC}} ) { return; }
-
- my $args = get_defaults($opts,"$$opts{prefix}acgt-cycles.png");
- my $fh = $$args{fh};
- print $fh qq[
- $$args{terminal}
- set output "$$args{img}"
- $$args{grid}
- set style line 1 linecolor rgb "green"
- set style line 2 linecolor rgb "red"
- set style line 3 linecolor rgb "black"
- set style line 4 linecolor rgb "blue"
- set style increment user
- set ylabel "Base content [%]"
- set xlabel "Read Cycle"
- set yrange [0:100]
- set title "$$args{title}"
- plot '-' w l ti 'A', '-' w l ti 'C', '-' w l ti 'G', '-' w l ti 'T'
- ];
- for my $base (1..4)
- {
- for my $cycle (@{$$opts{dat}{GCC}})
- {
- print $fh $$cycle[0]+1,"\t",$$cycle[$base],"\n";
- }
- print $fh "end\n";
- }
- close($fh);
- plot($$args{gp});
-}
-
-
-sub plot_gc
-{
- my ($opts) = @_;
-
- my $is_paired = $$opts{dat}{'is paired:'};
- my $args = get_defaults($opts,"$$opts{prefix}gc-content.png");
- my $fh = $$args{fh};
- my ($gcl_max,$gcf_max,$lmax,$fmax);
- for my $gc (@{$$opts{dat}{GCF}}) { if ( !defined $gcf_max or $gcf_max<$$gc[1] ) { $gcf_max=$$gc[1]; $fmax=$$gc[0]; } }
- for my $gc (@{$$opts{dat}{GCL}}) { if ( !defined $gcl_max or $gcl_max<$$gc[1] ) { $gcl_max=$$gc[1]; $lmax=$$gc[0]; } }
- my $gcmax = $is_paired && $gcl_max > $gcf_max ? $lmax : $fmax;
- print $fh qq[
- $$args{terminal}
- set output "$$args{img}"
- $$args{grid}
- set title "$$args{title}"
- set ylabel "Normalized Frequency"
- set xlabel "GC Content [%]"
- set yrange [0:1.1]
- set label sprintf("%.1f",$gcmax) at $gcmax,1 front offset 1,0
- plot ]
- . (exists($$opts{ref_stats}) ? q['-' smooth csplines with lines lt 0 title 'Reference', ] : '')
- . q['-' smooth csplines with lines lc 1 title 'First fragments' ]
- . ($is_paired ? q[, '-' smooth csplines with lines lc 2 title 'Last fragments'] : '')
- . q[
- ];
- if ( exists($$opts{ref_stats}) )
- {
- open(my $ref,'<',$$opts{ref_stats}) or error("$$opts{ref_stats}: $!");
- while (my $line=<$ref>) { print $fh $line }
- close($ref);
- print $fh "end\n";
- }
- for my $cycle (@{$$opts{dat}{GCF}}) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcf_max; }
- print $fh "end\n";
- if ( $is_paired )
- {
- for my $cycle (@{$$opts{dat}{GCL}}) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcl_max; }
- print $fh "end\n";
- }
- close($fh);
- plot($$args{gp});
-}
-
-
-sub plot_gc_depth
-{
- my ($opts) = @_;
-
- if ( !exists($$opts{dat}{GCD}) or !@{$$opts{dat}{GCD}} ) { return; }
-
- # Find unique sequence percentiles for 30,40, and 50% GC content, just to draw x2tics.
- my @tics = ( {gc=>30},{gc=>40},{gc=>50} );
- for my $gc (@{$$opts{dat}{GCD}})
- {
- for my $tic (@tics)
- {
- my $diff = abs($$gc[0]-$$tic{gc});
- if ( !exists($$tic{pr}) or $diff<$$tic{diff} ) { $$tic{pr}=$$gc[1]; $$tic{diff}=$diff; }
- }
- }
-
- my @x2tics;
- for my $tic (@tics) { push @x2tics, qq["$$tic{gc}" $$tic{pr}]; }
- my $x2tics = join(',',@x2tics);
-
- my $args = get_defaults($opts,"$$opts{prefix}gc-depth.png", wh=>'600,500');
- my $fh = $$args{fh};
- print $fh qq[
- $$args{terminal}
- set output "$$args{img}"
- $$args{grid}
- set ylabel "Mapped depth"
- set xlabel "Percentile of mapped sequence ordered by GC content"
- set x2label "GC Content [%]"
- set title "$$args{title}"
- set x2tics ($x2tics)
- set xtics nomirror
- set xrange [0.1:99.9]
-
- plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#dedede" t '10-90th percentile' , \\
- '-' using 1:2:3 with filledcurve lt 1 lc rgb "#bbdeff" t '25-75th percentile' , \\
- '-' using 1:2 with lines lc rgb "#0084ff" t 'Median'
- ];
- for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[2]\t$$gc[6]\n"; } print $fh "end\n";
- for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[3]\t$$gc[5]\n"; } print $fh "end\n";
- for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[4]\n"; } print $fh "end\n";
- close($fh);
- plot($$args{gp});
-}
-
-
-sub plot_isize
-{
- my ($opts) = @_;
-
- if ( !$$opts{dat}{'is paired:'} or !exists($$opts{dat}{IS}) or !@{$$opts{dat}{IS}} ) { return; }
-
- my ($isize_max,$isize_cnt);
- for my $isize (@{$$opts{dat}{IS}})
- {
- if ( !defined $isize_max or $isize_cnt<$$isize[1] ) { $isize_cnt=$$isize[1]; $isize_max=$$isize[0]; }
- }
-
- my $args = get_defaults($opts,"$$opts{prefix}insert-size.png");
- my $fh = $$args{fh};
- print $fh qq[
- $$args{terminal}
- set output "$$args{img}"
- $$args{grid}
- set rmargin 5
- set label sprintf("%d",$isize_max) at $isize_max+10,$isize_cnt
- set ylabel "Number of pairs"
- set xlabel "Insert Size"
- set title "$$args{title}"
- plot \\
- '-' with lines lc rgb 'black' title 'All pairs', \\
- '-' with lines title 'Inward', \\
- '-' with lines title 'Outward', \\
- '-' with lines title 'Other'
- ];
- for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[1]\n"; } print $fh "end\n";
- for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[2]\n"; } print $fh "end\n";
- for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[3]\n"; } print $fh "end\n";
- for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[4]\n"; } print $fh "end\n";
- close($fh);
- plot($$args{gp});
-}
-
-
-sub plot_coverage
-{
- my ($opts) = @_;
-
- if ( !exists($$opts{dat}{COV}) or !@{$$opts{dat}{COV}} ) { return; }
-
- my @vals;
- for my $cov (@{$$opts{dat}{COV}}) { push @vals,$$cov[2]; }
- my $i = percentile(99.8,@vals);
- my $p99 = $$opts{dat}{COV}[$i][1];
-
- my $args = get_defaults($opts,"$$opts{prefix}coverage.png");
- my $fh = $$args{fh};
- print $fh qq[
- $$args{terminal}
- set output "$$args{img}"
- $$args{grid}
- set ylabel "Number of mapped bases"
- set xlabel "Coverage"
- set style fill solid border -1
- set title "$$args{title}"
- set xrange [:$p99]
- plot '-' with lines notitle
- ];
- for my $cov (@{$$opts{dat}{COV}})
- {
- if ( $$cov[2]==0 ) { next; }
- print $fh "$$cov[1]\t$$cov[2]\n";
- }
- print $fh "end\n";
- close($fh);
- plot($$args{gp});
-}
-
-
-sub plot_mismatches_per_cycle
-{
- my ($opts) = @_;
-
- if ( !exists($$opts{dat}{MPC}) or !@{$$opts{dat}{MPC}} ) { return; }
- if ( older_than($opts,'2012-02-06') ) { plot_mismatches_per_cycle_old($opts); }
-
- my $nquals = @{$$opts{dat}{MPC}[0]} - 2;
- my $ncycles = @{$$opts{dat}{MPC}};
- my ($style,$with);
- if ( $ncycles>100 ) { $style = ''; $with = 'w l'; }
- else { $style = 'set style data histogram; set style histogram rowstacked'; $with = ''; }
-
- my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png");
- my $fh = $$args{fh};
- print $fh qq[
- $$args{terminal}
- set output "$$args{img}"
- $$args{grid}
- set style line 1 linecolor rgb "#e40000"
- set style line 2 linecolor rgb "#ff9f00"
- set style line 3 linecolor rgb "#eeee00"
- set style line 4 linecolor rgb "#4ebd68"
- set style line 5 linecolor rgb "#0061ff"
- set style increment user
- set key left top
- $style
- set ylabel "Number of mismatches"
- set xlabel "Read Cycle"
- set style fill solid border -1
- set title "$$args{title}"
- set xrange [-1:$ncycles]
- plot '-' $with ti 'Base Quality>30', \\
- '-' $with ti '30>=Q>20', \\
- '-' $with ti '20>=Q>10', \\
- '-' $with ti '10>=Q', \\
- '-' $with ti "N's"
- ];
- for my $cycle (@{$$opts{dat}{MPC}})
- {
- my $sum; for my $idx (31..$#$cycle) { $sum += $$cycle[$idx]; }
- print $fh "$sum\n";
- }
- print $fh "end\n";
- for my $cycle (@{$$opts{dat}{MPC}})
- {
- my $sum; for my $idx (22..31) { $sum += $$cycle[$idx]; }
- print $fh "$sum\n";
- }
- print $fh "end\n";
- for my $cycle (@{$$opts{dat}{MPC}})
- {
- my $sum; for my $idx (12..21) { $sum += $$cycle[$idx]; }
- print $fh "$sum\n";
- }
- print $fh "end\n";
- for my $cycle (@{$$opts{dat}{MPC}})
- {
- my $sum; for my $idx (2..11) { $sum += $$cycle[$idx]; }
- print $fh "$sum\n";
- }
- print $fh "end\n";
- for my $cycle (@{$$opts{dat}{MPC}}) { print $fh "$$cycle[1]\n"; }
- print $fh "end\n";
- close($fh);
- plot($$args{gp});
-}
-
-sub plot_indel_dist
-{
- my ($opts) = @_;
-
- if ( !exists($$opts{dat}{ID}) or !@{$$opts{dat}{ID}} ) { return; }
-
- my $args = get_defaults($opts,"$$opts{prefix}indel-dist.png");
- my $fh = $$args{fh};
- print $fh qq[
- $$args{terminal}
- set output "$$args{img}"
- $$args{grid}
- set style line 1 linetype 1 linecolor rgb "red"
- set style line 2 linetype 2 linecolor rgb "black"
- set style line 3 linetype 3 linecolor rgb "green"
- set style increment user
- set ylabel "Indel count [log]"
- set xlabel "Indel length"
- set y2label "Insertions/Deletions ratio"
- set log y
- set y2tics nomirror
- set ytics nomirror
- set title "$$args{title}"
- plot '-' w l ti 'Insertions', '-' w l ti 'Deletions', '-' axes x1y2 w l ti "Ins/Dels ratio"
- ];
- for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n";
- for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n";
- for my $len (@{$$opts{dat}{ID}}) { printf $fh "%d\t%f\n", $$len[0],$$len[2]?$$len[1]/$$len[2]:0; } print $fh "end\n";
- close($fh);
- plot($$args{gp});
-}
-
-sub plot_indel_cycles
-{
- my ($opts) = @_;
-
- if ( !exists($$opts{dat}{IC}) or !@{$$opts{dat}{IC}} ) { return; }
-
- my $args = get_defaults($opts,"$$opts{prefix}indel-cycles.png");
- my $fh = $$args{fh};
- print $fh qq[
- $$args{terminal}
- set output "$$args{img}"
- $$args{grid}
- set style line 1 linetype 1 linecolor rgb "red"
- set style line 2 linetype 2 linecolor rgb "black"
- set style line 3 linetype 3 linecolor rgb "green"
- set style line 4 linetype 4 linecolor rgb "blue"
- set style increment user
- set ylabel "Indel count"
- set xlabel "Read Cycle"
- set title "$$args{title}"
- plot '-' w l ti 'Insertions (fwd)', '' w l ti 'Insertions (rev)', '' w l ti 'Deletions (fwd)', '' w l ti 'Deletions (rev)'
- ];
- for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n";
- for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n";
- for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[3]\n"; } print $fh "end\n";
- for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[4]\n"; } print $fh "end\n";
- close($fh);
- plot($$args{gp});
-}
-
-
-
-
-
-
-
-sub has_values
-{
- my ($opts,@tags) = @_;
- for my $tag (@tags)
- {
- my (@lines) = `cat $$opts{bamcheck} | grep ^$tag | wc -l`;
- chomp($lines[0]);
- if ( $lines[0]<2 ) { return 0; }
- }
- return 1;
-}
-
-sub plot_mismatches_per_cycle_old
-{
- my ($opts) = @_;
-
- my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png");
- my ($nquals) = `grep ^MPC $$opts{bamcheck} | awk '\$2==1' | sed 's,\\t,\\n,g' | wc -l`;
- my ($ncycles) = `grep ^MPC $$opts{bamcheck} | wc -l`;
- chomp($nquals);
- chomp($ncycles);
- $nquals--;
- $ncycles--;
- my @gr0_15 = (2..17);
- my @gr16_30 = (18..32);
- my @gr31_n = (33..$nquals);
- my $gr0_15 = '$'. join('+$',@gr0_15);
- my $gr16_30 = '$'. join('+$',@gr16_30);
- my $gr31_n = '$'. join('+$',@gr31_n);
-
- open(my $fh,'>',$$args{gp}) or error("$$args{gp}: $!");
- print $fh q[
- set terminal png size 600,400 truecolor font "DejaVuSansMono,9"
- set output "] . $$args{img} . q["
-
- set key left top
- set style data histogram
- set style histogram rowstacked
-
- set grid back lc rgb "#aaaaaa"
- set ylabel "Number of mismatches"
- set xlabel "Read Cycle"
- set style fill solid border -1
- set title "] . $$args{title} . qq["
- set xrange [-1:$ncycles]
-
- plot '< grep ^MPC $$opts{bamcheck} | cut -f 2-' using ($gr31_n) ti 'Base Quality>30', '' using ($gr16_30) ti '30>=Q>15', '' using ($gr0_15) ti '15>=Q'
- ];
- close($fh);
-
- plot($$args{gp});
-}
-
-
diff --git a/sam/misc/psl2sam.pl b/sam/misc/psl2sam.pl
deleted file mode 100755
index a96a6de..0000000
--- a/sam/misc/psl2sam.pl
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/perl -w
-
-# Author: lh3
-
-# This script calculates a score using the BLAST scoring
-# system. However, I am not sure how to count gap opens and gap
-# extensions. It seems to me that column 5-8 are not what I am
-# after. This script counts gaps from the last three columns. It does
-# not generate reference skip (N) in the CIGAR as it is not easy to
-# directly tell which gaps correspond to introns.
-
-use strict;
-use warnings;
-use Getopt::Std;
-
-my %opts = (a=>1, b=>3, q=>5, r=>2);
-getopts('a:b:q:r:', \%opts);
-die("Usage: psl2sam.pl [-a $opts{a}] [-b $opts{b}] [-q $opts{q}] [-r $opts{r}] <in.psl>\n") if (@ARGV == 0 && -t STDIN);
-
-my @stack;
-my $last = '';
-my ($a, $b, $q, $r) = ($opts{a}, $opts{b}, $opts{q}, $opts{r});
-while (<>) {
- next unless (/^\d/);
- my @t = split;
- my @s;
- my $cigar = '';
- if ($t[8] eq '-') {
- my $tmp = $t[11];
- $t[11] = $t[10] - $t[12];
- $t[12] = $t[10] - $tmp;
- }
- @s[0..4] = ($t[9], (($t[8] eq '+')? 0 : 16), $t[13], $t[15]+1, 0);
- @s[6..10] = ('*', 0, 0, '*', '*');
- $cigar .= $t[11].'H' if ($t[11]); # 5'-end clipping
- my @x = split(',', $t[18]);
- my @y = split(',', $t[19]);
- my @z = split(',', $t[20]);
- my ($y0, $z0) = ($y[0], $z[0]);
- my ($gap_open, $gap_ext) = (0, 0, 0);
- for (1 .. $t[17]-1) {
- my $ly = $y[$_] - $y[$_-1] - $x[$_-1];
- my $lz = $z[$_] - $z[$_-1] - $x[$_-1];
- if ($ly < $lz) { # del: the reference gap is longer
- ++$gap_open;
- $gap_ext += $lz - $ly;
- $cigar .= ($y[$_] - $y0) . 'M';
- $cigar .= ($lz - $ly) . 'D';
- ($y0, $z0) = ($y[$_], $z[$_]);
- } elsif ($lz < $ly) { # ins: the query gap is longer
- ++$gap_open;
- $gap_ext += $ly - $lz;
- $cigar .= ($z[$_] - $z0) . 'M';
- $cigar .= ($ly - $lz) . 'I';
- ($y0, $z0) = ($y[$_], $z[$_]);
- }
- }
- $cigar .= ($t[12] - $y0) . 'M';
- $cigar .= ($t[10] - $t[12]).'H' if ($t[10] != $t[12]); # 3'-end clipping
- $s[5] = $cigar;
- my $score = $a * $t[0] - $b * $t[1] - $q * $gap_open - $r * $gap_ext;
- $score = 0 if ($score < 0);
- $s[11] = "AS:i:$score";
- print join("\t", @s), "\n";
-}
diff --git a/sam/misc/r2plot.lua b/sam/misc/r2plot.lua
deleted file mode 100755
index 0a1b9f1..0000000
--- a/sam/misc/r2plot.lua
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/env luajit
-
-function string:split(sep, n)
- local a, start = {}, 1;
- sep = sep or "%s+";
- repeat
- local b, e = self:find(sep, start);
- if b == nil then
- table.insert(a, self:sub(start));
- break
- end
- a[#a+1] = self:sub(start, b - 1);
- start = e + 1;
- if n and #a == n then
- table.insert(a, self:sub(start));
- break
- end
- until start > #self;
- return a;
-end
-
-function io.xopen(fn, mode)
- mode = mode or 'r';
- if fn == nil then return io.stdin;
- elseif fn == '-' then return (mode == 'r' and io.stdin) or io.stdout;
- elseif fn:sub(-3) == '.gz' then return (mode == 'r' and io.popen('gzip -dc ' .. fn, 'r')) or io.popen('gzip > ' .. fn, 'w');
- elseif fn:sub(-4) == '.bz2' then return (mode == 'r' and io.popen('bzip2 -dc ' .. fn, 'r')) or io.popen('bgzip2 > ' .. fn, 'w');
- else return io.open(fn, mode) end
-end
-
-local eps = {};
-
-function eps.func(fp)
- fp = fp or io.stdout
- fp:write("/C { dup 255 and 255 div exch dup -8 bitshift 255 and 255 div 3 1 roll -16 bitshift 255 and 255 div 3 1 roll setrgbcolor } bind def\n")
- fp:write("/L { 4 2 roll moveto lineto } bind def\n")
- fp:write("/LX { dup 4 -1 roll exch moveto lineto } bind def\n")
- fp:write("/LY { dup 4 -1 roll moveto exch lineto } bind def\n")
- fp:write("/LS { 3 1 roll moveto show } bind def\n")
- fp:write("/RS { dup stringwidth pop 4 -1 roll exch sub 3 -1 roll moveto show } bind def\n")
- fp:write("/B { 4 copy 3 1 roll exch 6 2 roll 8 -2 roll moveto lineto lineto lineto closepath } bind def\n")
-end
-
-function eps.font(ft, size, fp)
- fp = fp or io.stdout
- fp:write(string.format('/FS %d def\n', size));
- fp:write('/FS4 FS 4 div def\n');
- fp:write('/' .. ft .. ' findfont FS scalefont setfont\n');
-end
-
-local scale = 8;
-
-if #arg == 0 then
- print("Usage: r2plot.lua <in.txt>");
- os.exit(1)
-end
-
-local fp = io.xopen(arg[1]);
-local n = tonumber(fp:read());
-
-print('%!PS-Adobe-3.0 EPSF-3.0');
-print('%%' .. string.format('BoundingBox: -%d -%d %.3f %.3f\n', 10*scale, scale, (n+1)*scale, (n+1)*scale));
-print(string.format('%.3f setlinewidth', scale));
-print(string.format('/plot { setgray moveto 0 %d rlineto } def', scale));
-print(string.format('/plothalf { setgray moveto 0 %.2f rlineto } def', scale/2));
-eps.func();
-eps.font('Helvetica', scale-1);
-
-local i = 1;
-for l in fp:lines() do
- local t = l:split('\t');
- print(string.format("%d %d FS4 add (%s) RS", (i-1)*scale-2, (i-1)*scale, t[1]));
- for j = 2, #t do
- if tonumber(t[j]) > 0.01 then
- print(string.format('%.2f %.2f %.2f plot stroke', (i-1+.5)*scale, (j-2)*scale, 1.-t[j]));
- end
- end
- i = i + 1;
-end
-for j = 1, 21 do
- print(string.format('%.2f %.2f %.2f plothalf stroke', -8*scale, (j-1) * scale/2, 1.-(j-1)/20));
-end
-print('showpage');
diff --git a/sam/misc/sam2vcf.pl b/sam/misc/sam2vcf.pl
deleted file mode 100755
index afaf91e..0000000
--- a/sam/misc/sam2vcf.pl
+++ /dev/null
@@ -1,270 +0,0 @@
-#!/usr/bin/perl -w
-#
-# VCF specs: http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3
-#
-# Contact: ***@sanger
-# Version: 2010-04-23
-
-use strict;
-use warnings;
-use Carp;
-
-my $opts = parse_params();
-do_pileup_to_vcf($opts);
-
-exit;
-
-#---------------
-
-sub error
-{
- my (@msg) = @_;
- if ( scalar @msg ) { croak(@msg); }
- die
- "Usage: sam2vcf.pl [OPTIONS] < in.pileup > out.vcf\n",
- "Options:\n",
- " -h, -?, --help This help message.\n",
- " -i, --indels-only Ignore SNPs.\n",
- " -r, --refseq <file.fa> The reference sequence, required when indels are present.\n",
- " -R, --keep-ref Print reference alleles as well.\n",
- " -s, --snps-only Ignore indels.\n",
- " -t, --column-title <string> The column title.\n",
- "\n";
-}
-
-
-sub parse_params
-{
- my %opts = ();
-
- $opts{fh_in} = *STDIN;
- $opts{fh_out} = *STDOUT;
-
- while (my $arg=shift(@ARGV))
- {
- if ( $arg eq '-R' || $arg eq '--keep-ref' ) { $opts{keep_ref}=1; next; }
- if ( $arg eq '-r' || $arg eq '--refseq' ) { $opts{refseq}=shift(@ARGV); next; }
- if ( $arg eq '-t' || $arg eq '--column-title' ) { $opts{title}=shift(@ARGV); next; }
- if ( $arg eq '-s' || $arg eq '--snps-only' ) { $opts{snps_only}=1; next; }
- if ( $arg eq '-i' || $arg eq '--indels-only' ) { $opts{indels_only}=1; next; }
- if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
-
- error("Unknown parameter \"$arg\". Run -h for help.\n");
- }
- return \%opts;
-}
-
-sub iupac_to_gtype
-{
- my ($ref,$base) = @_;
- my %iupac = (
- 'K' => ['G','T'],
- 'M' => ['A','C'],
- 'S' => ['C','G'],
- 'R' => ['A','G'],
- 'W' => ['A','T'],
- 'Y' => ['C','T'],
- );
- if ( !exists($iupac{$base}) )
- {
- if ( $base ne 'A' && $base ne 'C' && $base ne 'G' && $base ne 'T' ) { error("FIXME: what is this [$base]?\n"); }
- if ( $ref eq $base ) { return ('.','0/0'); }
- return ($base,'1/1');
- }
- my $gt = $iupac{$base};
- if ( $$gt[0] eq $ref ) { return ($$gt[1],'0/1'); }
- elsif ( $$gt[1] eq $ref ) { return ($$gt[0],'0/1'); }
- return ("$$gt[0],$$gt[1]",'1/2');
-}
-
-
-sub parse_indel
-{
- my ($cons) = @_;
- if ( $cons=~/^-/ )
- {
- my $len = length($');
- return "D$len";
- }
- elsif ( $cons=~/^\+/ ) { return "I$'"; }
- elsif ( $cons eq '*' ) { return undef; }
- error("FIXME: could not parse [$cons]\n");
-}
-
-
-# An example of the pileup format:
-# 1 3000011 C C 32 0 98 1 ^~, A
-# 1 3002155 * +T/+T 53 119 52 5 +T * 4 1 0
-# 1 3003094 * -TT/-TT 31 164 60 11 -TT * 5 6 0
-# 1 3073986 * */-AAAAAAAAAAAAAA 3 3 45 9 * -AAAAAAAAAAAAAA 7 2 0
-#
-sub do_pileup_to_vcf
-{
- my ($opts) = @_;
-
- my $fh_in = $$opts{fh_in};
- my $fh_out = $$opts{fh_out};
- my ($prev_chr,$prev_pos,$prev_ref);
- my $refseq;
- my $ignore_indels = $$opts{snps_only} ? 1 : 0;
- my $ignore_snps = $$opts{indels_only} ? 1 : 0;
- my $keep_ref = $$opts{keep_ref} ? 1 : 0;
- my $title = exists($$opts{title}) ? $$opts{title} : 'data';
-
- print $fh_out
- qq[##fileformat=VCFv3.3\n],
- qq[##INFO=DP,1,Integer,"Total Depth"\n],
- qq[##FORMAT=GT,1,String,"Genotype"\n],
- qq[##FORMAT=GQ,1,Integer,"Genotype Quality"\n],
- qq[##FORMAT=DP,1,Integer,"Read Depth"\n],
- qq[#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$title\n]
- ;
-
- while (my $line=<$fh_in>)
- {
- chomp($line);
- my (@items) = split(/\t/,$line);
- if ( scalar @items<8 )
- {
- error("\nToo few columns, does not look like output of 'samtools pileup -c': $line\n");
- }
- my ($chr,$pos,$ref,$cons,$cons_qual,$snp_qual,$rms_qual,$depth,$a1,$a2) = @items;
- $ref = uc($ref);
- $cons = uc($cons);
-
- my ($alt,$gt);
- if ( $ref eq '*' )
- {
- # An indel is involved.
- if ( $ignore_indels )
- {
- $prev_ref = $ref;
- $prev_pos = $pos;
- $prev_chr = $chr;
- next;
- }
-
- if (!defined $prev_chr || $chr ne $prev_chr || $pos ne $prev_pos)
- {
- if ( !$$opts{refseq} ) { error("Cannot do indels without the reference.\n"); }
- if ( !$refseq ) { $refseq = Fasta->new(file=>$$opts{refseq}); }
- $ref = $refseq->get_base($chr,$pos);
- $ref = uc($ref);
- }
- else { $ref = $prev_ref; }
-
- # One of the alleles can be a reference and it can come in arbitrary order. In some
- # cases */* can be encountered. In such a case, look in the additional columns.
- my ($al1,$al2) = split(m{/},$cons);
- if ( $al1 eq $al2 && $al1 eq '*' ) { $al1=$a1; $al2=$a2; }
- my $alt1 = parse_indel($al1);
- my $alt2 = parse_indel($al2);
- if ( !$alt1 && !$alt2 ) { error("FIXME: could not parse indel:\n", $line); }
- if ( !$alt1 )
- {
- $alt=$alt2;
- $gt='0/1';
- }
- elsif ( !$alt2 )
- {
- $alt=$alt1;
- $gt='0/1';
- }
- elsif ( $alt1 eq $alt2 )
- {
- $alt="$alt1";
- $gt='1/1';
- }
- else
- {
- $alt="$alt1,$alt2";
- $gt='1/2';
- }
- }
- else
- {
- if ( $ignore_snps || (!$keep_ref && $ref eq $cons) )
- {
- $prev_ref = $ref;
- $prev_pos = $pos;
- $prev_chr = $chr;
- next;
- }
-
- # SNP
- ($alt,$gt) = iupac_to_gtype($ref,$cons);
- }
-
- print $fh_out "$chr\t$pos\t.\t$ref\t$alt\t$snp_qual\t0\tDP=$depth\tGT:GQ:DP\t$gt:$cons_qual:$depth\n";
-
- $prev_ref = $ref;
- $prev_pos = $pos;
- $prev_chr = $chr;
- }
-}
-
-
-#------------- Fasta --------------------
-#
-# Uses samtools to get a requested base from a fasta file. For efficiency, preloads
-# a chunk to memory. The size of the cached sequence can be controlled by the 'size'
-# parameter.
-#
-package Fasta;
-
-use strict;
-use warnings;
-use Carp;
-
-sub Fasta::new
-{
- my ($class,@args) = @_;
- my $self = {@args};
- bless $self, ref($class) || $class;
- if ( !$$self{file} ) { $self->throw(qq[Missing the parameter "file"\n]); }
- $$self{chr} = undef;
- $$self{from} = undef;
- $$self{to} = undef;
- if ( !$$self{size} ) { $$self{size}=10_000_000; }
- bless $self, ref($class) || $class;
- return $self;
-}
-
-sub read_chunk
-{
- my ($self,$chr,$pos) = @_;
- my $to = $pos + $$self{size};
- my $cmd = "samtools faidx $$self{file} $chr:$pos-$to";
- my @out = `$cmd`;
- if ( $? ) { $self->throw("$cmd: $!"); }
- my $line = shift(@out);
- if ( !($line=~/^>$chr:(\d+)-(\d+)/) ) { $self->throw("Could not parse: $line"); }
- $$self{chr} = $chr;
- $$self{from} = $1;
- $$self{to} = $2;
- my $chunk = '';
- while ($line=shift(@out))
- {
- chomp($line);
- $chunk .= $line;
- }
- $$self{chunk} = $chunk;
- return;
-}
-
-sub get_base
-{
- my ($self,$chr,$pos) = @_;
- if ( !$$self{chr} || $chr ne $$self{chr} || $pos<$$self{from} || $pos>$$self{to} )
- {
- $self->read_chunk($chr,$pos);
- }
- my $idx = $pos - $$self{from};
- return substr($$self{chunk},$idx,1);
-}
-
-sub throw
-{
- my ($self,@msg) = @_;
- croak(@msg);
-}
diff --git a/sam/misc/samtools.pl b/sam/misc/samtools.pl
deleted file mode 100755
index d03c1c7..0000000
--- a/sam/misc/samtools.pl
+++ /dev/null
@@ -1,528 +0,0 @@
-#!/usr/bin/perl -w
-
-# Author: lh3
-
-use strict;
-use warnings;
-use Getopt::Std;
-
-my $version = '0.3.3';
-&usage if (@ARGV < 1);
-
-my $command = shift(@ARGV);
-my %func = (showALEN=>\&showALEN, pileup2fq=>\&pileup2fq, varFilter=>\&varFilter, plp2vcf=>\&plp2vcf,
- unique=>\&unique, uniqcmp=>\&uniqcmp, sra2hdr=>\&sra2hdr, sam2fq=>\&sam2fq);
-
-die("Unknown command \"$command\".\n") if (!defined($func{$command}));
-&{$func{$command}};
-exit(0);
-
-#
-# showALEN
-#
-
-sub showALEN {
- die(qq/Usage: samtools.pl showALEN <in.sam>\n/) if (@ARGV == 0 && -t STDIN);
- while (<>) {
- my @t = split;
- next if (/^\@/ || @t < 11);
- my $l = 0;
- $_ = $t[5];
- s/(\d+)[MI]/$l+=$1/eg;
- print join("\t", @t[0..5]), "\t$l\t", join("\t", @t[6..$#t]), "\n";
- }
-}
-
-#
-# varFilter
-#
-
-#
-# Filtration code:
-#
-# d low depth
-# D high depth
-# W too many SNPs in a window (SNP only)
-# G close to a high-quality indel (SNP only)
-# Q low RMS mapping quality (SNP only)
-# g close to another indel with higher quality (indel only)
-# s low SNP quality (SNP only)
-# i low indel quality (indel only)
-
-sub varFilter {
- my %opts = (d=>3, D=>100, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef, S=>'', i=>'');
- getopts('pq:d:D:l:Q:w:W:N:G:S:i:', \%opts);
- die(qq/
-Usage: samtools.pl varFilter [options] <in.cns-pileup>
-
-Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
- -q INT minimum RMS mapping quality for gaps [$opts{q}]
- -d INT minimum read depth [$opts{d}]
- -D INT maximum read depth [$opts{D}]
- -S INT minimum SNP quality [$opts{S}]
- -i INT minimum indel quality [$opts{i}]
-
- -G INT min indel score for nearby SNP filtering [$opts{G}]
- -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
-
- -W INT window size for filtering dense SNPs [$opts{W}]
- -N INT max number of SNPs in a window [$opts{N}]
-
- -l INT window size for filtering adjacent gaps [$opts{l}]
-
- -p print filtered variants
-\n/) if (@ARGV == 0 && -t STDIN);
-
- # calculate the window size
- my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W});
- my $max_dist = $ol > $ow? $ol : $ow;
- $max_dist = $oW if ($max_dist < $oW);
- # the core loop
- my @staging; # (indel_filtering_score, flt_tag)
- while (<>) {
- my @t = split;
- next if (uc($t[2]) eq uc($t[3]) || $t[3] eq '*/*'); # skip non-var sites
- # clear the out-of-range elements
- while (@staging) {
- # Still on the same chromosome and the first element's window still affects this position?
- last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
- varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
- }
- my ($flt, $score) = (0, -1);
- # first a simple filter
- if ($t[7] < $opts{d}) {
- $flt = 2;
- } elsif ($t[7] > $opts{D}) {
- $flt = 3;
- }
- if ($t[2] eq '*') { # an indel
- if ($opts{i} && $opts{i}>$t[5]) { $flt = 8; }
- }
- elsif ($opts{S} && $opts{S}>$t[5]) { $flt = 7; } # SNP
-
- # site dependent filters
- my $len=0;
- if ($flt == 0) {
- if ($t[2] eq '*') { # an indel
- # If deletion, remember the length of the deletion
- my ($a,$b) = split(m{/},$t[3]);
- my $alen = length($a) - 1;
- my $blen = length($b) - 1;
- if ( $alen>$blen )
- {
- if ( substr($a,0,1) eq '-' ) { $len=$alen; }
- }
- elsif ( substr($b,0,1) eq '-' ) { $len=$blen; }
-
- $flt = 1 if ($t[6] < $opts{q});
- # filtering SNPs
- if ($t[5] >= $opts{G}) {
- for my $x (@staging) {
- # Is it a SNP and is it outside the SNP filter window?
- next if ($x->[0] >= 0 || $x->[4] + $x->[2] + $ow < $t[1]);
- $x->[1] = 5 if ($x->[1] == 0);
- }
- }
- # calculate the filtering score (different from indel quality)
- $score = $t[5];
- $score += $opts{s} * $t[10] if ($t[8] ne '*');
- $score += $opts{s} * $t[11] if ($t[9] ne '*');
- # check the staging list for indel filtering
- for my $x (@staging) {
- # Is it a SNP and is it outside the gap filter window
- next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ol < $t[1]);
- if ($x->[0] < $score) {
- $x->[1] = 6;
- } else {
- $flt = 6; last;
- }
- }
- } else { # a SNP
- $flt = 1 if ($t[6] < $opts{Q});
- # check adjacent SNPs
- my $k = 1;
- for my $x (@staging) {
- ++$k if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5));
- }
- # filtering is necessary
- if ($k > $opts{N}) {
- $flt = 4;
- for my $x (@staging) {
- $x->[1] = 4 if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && $x->[1] == 0);
- }
- } else { # then check gap filter
- for my $x (@staging) {
- next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ow < $t[1]);
- if ($x->[0] >= $opts{G}) {
- $flt = 5; last;
- }
- }
- }
- }
- }
- push(@staging, [$score, $flt, $len, @t]);
- }
- # output the last few elements in the staging list
- while (@staging) {
- varFilter_aux(shift @staging, $opts{p});
- }
-}
-
-sub varFilter_aux {
- my ($first, $is_print) = @_;
- if ($first->[1] == 0) {
- print join("\t", @$first[3 .. @$first-1]), "\n";
- } elsif ($is_print) {
- print STDERR join("\t", substr("UQdDWGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
- }
-}
-
-#
-# pileup2fq
-#
-
-sub pileup2fq {
- my %opts = (d=>3, D=>255, Q=>25, G=>25, l=>10);
- getopts('d:D:Q:G:l:', \%opts);
- die(qq/
-Usage: samtools.pl pileup2fq [options] <in.cns-pileup>
-
-Options: -d INT minimum depth [$opts{d}]
- -D INT maximum depth [$opts{D}]
- -Q INT min RMS mapQ [$opts{Q}]
- -G INT minimum indel score [$opts{G}]
- -l INT indel filter winsize [$opts{l}]\n
-/) if (@ARGV == 0 && -t STDIN);
-
- my ($last_chr, $seq, $qual, @gaps, $last_pos);
- my $_Q = $opts{Q};
- my $_d = $opts{d};
- my $_D = $opts{D};
-
- $last_chr = '';
- while (<>) {
- my @t = split;
- if ($last_chr ne $t[0]) {
- &p2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr);
- $last_chr = $t[0];
- $last_pos = 0;
- $seq = ''; $qual = '';
- @gaps = ();
- }
- if ($t[1] - $last_pos != 1) {
- $seq .= 'n' x ($t[1] - $last_pos - 1);
- $qual .= '!' x ($t[1] - $last_pos - 1);
- }
- if ($t[2] eq '*') {
- push(@gaps, $t[1]) if ($t[5] >= $opts{G});
- } else {
- $seq .= ($t[6] >= $_Q && $t[7] >= $_d && $t[7] <= $_D)? uc($t[3]) : lc($t[3]);
- my $q = $t[4] + 33;
- $q = 126 if ($q > 126);
- $qual .= chr($q);
- }
- $last_pos = $t[1];
- }
- &p2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l});
-}
-
-sub p2q_post_process {
- my ($chr, $seq, $qual, $gaps, $l) = @_;
- &p2q_filter_gaps($seq, $gaps, $l);
- print "\@$chr\n"; &p2q_print_str($seq);
- print "+\n"; &p2q_print_str($qual);
-}
-
-sub p2q_filter_gaps {
- my ($seq, $gaps, $l) = @_;
- for my $g (@$gaps) {
- my $x = $g > $l? $g - $l : 0;
- substr($$seq, $x, $l + $l) = lc(substr($$seq, $x, $l + $l));
- }
-}
-
-sub p2q_print_str {
- my ($s) = @_;
- my $l = length($$s);
- for (my $i = 0; $i < $l; $i += 60) {
- print substr($$s, $i, 60), "\n";
- }
-}
-
-#
-# sam2fq
-#
-
-sub sam2fq {
- my %opts = (n=>20, p=>'');
- getopts('n:p:', \%opts);
- die("Usage: samtools.pl sam2fq [-n 20] [-p <prefix>] <inp.sam>\n") if (@ARGV == 0 && -t STDIN);
- if ($opts{p} && $opts{n} > 1) {
- my $pre = $opts{p};
- my @fh;
- for (0 .. $opts{n}-1) {
- open($fh[$_], sprintf("| gzip > $pre.%.3d.fq.gz", $_)) || die;
- }
- my $i = 0;
- while (<>) {
- next if (/^@/);
- chomp;
- my @t = split("\t");
- next if ($t[9] eq '*');
- my ($name, $seq, $qual);
- if ($t[1] & 16) { # reverse strand
- $seq = reverse($t[9]);
- $qual = reverse($t[10]);
- $seq =~ tr/ACGTacgt/TGCAtgca/;
- } else {
- ($seq, $qual) = @t[9,10];
- }
- $name = $t[0];
- $name .= "/1" if ($t[1] & 0x40);
- $name .= "/2" if ($t[1] & 0x80);
- print {$fh[$i]} "\@$name\n$seq\n";
- if ($qual ne '*') {
- print {$fh[$i]} "+\n$qual\n";
- }
- $i = 0 if (++$i == $opts{n});
- }
- close($fh[$_]) for (0 .. $opts{n}-1);
- } else {
- die("To be implemented.\n");
- }
-}
-
-#
-# sra2hdr
-#
-
-# This subroutine does not use an XML parser. It requires that the SRA
-# XML files are properly formated.
-sub sra2hdr {
- my %opts = ();
- getopts('', \%opts);
- die("Usage: samtools.pl sra2hdr <SRA.prefix>\n") if (@ARGV == 0);
- my $pre = $ARGV[0];
- my $fh;
- # read sample
- my $sample = 'UNKNOWN';
- open($fh, "$pre.sample.xml") || die;
- while (<$fh>) {
- $sample = $1 if (/<SAMPLE.*alias="([^"]+)"/i);
- }
- close($fh);
- # read experiment
- my (%exp2lib, $exp);
- open($fh, "$pre.experiment.xml") || die;
- while (<$fh>) {
- if (/<EXPERIMENT.*accession="([^\s"]+)"/i) {
- $exp = $1;
- } elsif (/<LIBRARY_NAME>\s*(\S+)\s*<\/LIBRARY_NAME>/i) {
- $exp2lib{$exp} = $1;
- }
- }
- close($fh);
- # read run
- my ($run, @fn);
- open($fh, "$pre.run.xml") || die;
- while (<$fh>) {
- if (/<RUN.*accession="([^\s"]+)"/i) {
- $run = $1; @fn = ();
- } elsif (/<EXPERIMENT_REF.*accession="([^\s"]+)"/i) {
- print "\@RG\tID:$run\tSM:$sample\tLB:$exp2lib{$1}\n";
- } elsif (/<FILE.*filename="([^\s"]+)"/i) {
- push(@fn, $1);
- } elsif (/<\/RUN>/i) {
- if (@fn == 1) {
- print STDERR "$fn[0]\t$run\n";
- } else {
- for (0 .. $#fn) {
- print STDERR "$fn[$_]\t$run", "_", $_+1, "\n";
- }
- }
- }
- }
- close($fh);
-}
-
-#
-# unique
-#
-
-sub unique {
- my %opts = (f=>250.0, q=>5, r=>2, a=>1, b=>3);
- getopts('Qf:q:r:a:b:m', \%opts);
- die("Usage: samtools.pl unique [-f $opts{f}] <in.sam>\n") if (@ARGV == 0 && -t STDIN);
- my $last = '';
- my $recal_Q = !defined($opts{Q});
- my $multi_only = defined($opts{m});
- my @a;
- while (<>) {
- my $score = -1;
- print $_ if (/^\@/);
- $score = $1 if (/AS:i:(\d+)/);
- my @t = split("\t");
- next if (@t < 11);
- if ($score < 0) { # AS tag is unavailable
- my $cigar = $t[5];
- my ($mm, $go, $ge) = (0, 0, 0);
- $cigar =~ s/(\d+)[ID]/++$go,$ge+=$1/eg;
- $cigar = $t[5];
- $cigar =~ s/(\d+)M/$mm+=$1/eg;
- $score = $mm * $opts{a} - $go * $opts{q} - $ge * $opts{r}; # no mismatches...
- }
- $score = 1 if ($score < 1);
- if ($t[0] ne $last) {
- &unique_aux(\@a, $opts{f}, $recal_Q, $multi_only) if (@a);
- $last = $t[0];
- }
- push(@a, [$score, \@t]);
- }
- &unique_aux(\@a, $opts{f}, $recal_Q, $multi_only) if (@a);
-}
-
-sub unique_aux {
- my ($a, $fac, $is_recal, $multi_only) = @_;
- my ($max, $max2, $max_i) = (0, 0, -1);
- for (my $i = 0; $i < @$a; ++$i) {
- if ($a->[$i][0] > $max) {
- $max2 = $max; $max = $a->[$i][0]; $max_i = $i;
- } elsif ($a->[$i][0] > $max2) {
- $max2 = $a->[$i][0];
- }
- }
- if ($is_recal) {
- if (!$multi_only || @$a > 1) {
- my $q = int($fac * ($max - $max2) / $max + .499);
- $q = 250 if ($q > 250);
- $a->[$max_i][1][4] = $q < 250? $q : 250;
- }
- }
- print join("\t", @{$a->[$max_i][1]});
- @$a = ();
-}
-
-#
-# uniqcmp: compare two SAM files
-#
-
-sub uniqcmp {
- my %opts = (q=>10, s=>100);
- getopts('pq:s:', \%opts);
- die("Usage: samtools.pl uniqcmp <in1.sam> <in2.sam>\n") if (@ARGV < 2);
- my ($fh, %a);
- warn("[uniqcmp] read the first file...\n");
- &uniqcmp_aux($ARGV[0], \%a, 0);
- warn("[uniqcmp] read the second file...\n");
- &uniqcmp_aux($ARGV[1], \%a, 1);
- warn("[uniqcmp] stats...\n");
- my @cnt;
- $cnt[$_] = 0 for (0..9);
- for my $x (keys %a) {
- my $p = $a{$x};
- my $z;
- if (defined($p->[0]) && defined($p->[1])) {
- $z = ($p->[0][0] == $p->[1][0] && $p->[0][1] eq $p->[1][1] && abs($p->[0][2] - $p->[1][2]) < $opts{s})? 0 : 1;
- if ($p->[0][3] >= $opts{q} && $p->[1][3] >= $opts{q}) {
- ++$cnt[$z*3+0];
- } elsif ($p->[0][3] >= $opts{q}) {
- ++$cnt[$z*3+1];
- } elsif ($p->[1][3] >= $opts{q}) {
- ++$cnt[$z*3+2];
- }
- print STDERR "$x\t$p->[0][1]:$p->[0][2]\t$p->[0][3]\t$p->[0][4]\t$p->[1][1]:$p->[1][2]\t$p->[1][3]\t$p->[1][4]\t",
- $p->[0][5]-$p->[1][5], "\n" if ($z && defined($opts{p}) && ($p->[0][3] >= $opts{q} || $p->[1][3] >= $opts{q}));
- } elsif (defined($p->[0])) {
- ++$cnt[$p->[0][3]>=$opts{q}? 6 : 7];
- print STDERR "$x\t$p->[0][1]:$p->[0][2]\t$p->[0][3]\t$p->[0][4]\t*\t0\t*\t",
- $p->[0][5], "\n" if (defined($opts{p}) && $p->[0][3] >= $opts{q});
- } else {
- print STDERR "$x\t*\t0\t*\t$p->[1][1]:$p->[1][2]\t$p->[1][3]\t$p->[1][4]\t",
- -$p->[1][5], "\n" if (defined($opts{p}) && $p->[1][3] >= $opts{q});
- ++$cnt[$p->[1][3]>=$opts{q}? 8 : 9];
- }
- }
- print "Consistent (high, high): $cnt[0]\n";
- print "Consistent (high, low ): $cnt[1]\n";
- print "Consistent (low , high): $cnt[2]\n";
- print "Inconsistent (high, high): $cnt[3]\n";
- print "Inconsistent (high, low ): $cnt[4]\n";
- print "Inconsistent (low , high): $cnt[5]\n";
- print "Second missing (high): $cnt[6]\n";
- print "Second missing (low ): $cnt[7]\n";
- print "First missing (high): $cnt[8]\n";
- print "First missing (low ): $cnt[9]\n";
-}
-
-sub uniqcmp_aux {
- my ($fn, $a, $which) = @_;
- my $fh;
- $fn = "samtools view $fn |" if ($fn =~ /\.bam/);
- open($fh, $fn) || die;
- while (<$fh>) {
- my @t = split;
- next if (@t < 11);
-# my $l = ($t[5] =~ /^(\d+)S/)? $1 : 0;
- my $l = 0;
- my ($x, $nm) = (0, 0);
- $nm = $1 if (/NM:i:(\d+)/);
- $_ = $t[5];
- s/(\d+)[MI]/$x+=$1/eg;
- @{$a->{$t[0]}[$which]} = (($t[1]&0x10)? 1 : 0, $t[2], $t[3]-$l, $t[4], "$x:$nm", $x - 4 * $nm);
- }
- close($fh);
-}
-
-sub plp2vcf {
- while (<>) {
- my @t = split;
- next if ($t[3] eq '*/*');
- if ($t[2] eq '*') { # indel
- my @s = split("/", $t[3]);
- my (@a, @b);
- my ($ref, $alt);
- for (@s) {
- next if ($_ eq '*');
- if (/^-/) {
- push(@a, 'N'.substr($_, 1));
- push(@b, 'N');
- } elsif (/^\+/) {
- push(@a, 'N');
- push(@b, 'N'.substr($_, 1));
- }
- }
- if ($a[0] && $a[1]) {
- if (length($a[0]) < length($a[1])) {
- $ref = $a[1];
- $alt = ($b[0] . ('N' x (length($a[1]) - length($a[0])))) . ",$b[1]";
- } elsif (length($a[0]) > length($a[1])) {
- $ref = $a[0];
- $alt = ($b[1] . ('N' x (length($a[0]) - length($a[1])))) . ",$b[0]";
- } else {
- $ref = $a[0];
- $alt = ($b[0] eq $b[1])? $b[0] : "$b[0],$b[1]";
- }
- } else {
- $ref = $a[0]; $alt = $b[0];
- }
- print join("\t", @t[0,1], '.', $ref, $alt, $t[5], '.', '.'), "\n";
- } else { # SNP
- }
- }
-}
-
-#
-# Usage
-#
-
-sub usage {
- die(qq/
-Program: samtools.pl (helper script for SAMtools)
-Version: $version
-Contact: Heng Li <lh3\@sanger.ac.uk>\n
-Usage: samtools.pl <command> [<arguments>]\n
-Command: varFilter filtering SNPs and short indels
- pileup2fq generate fastq from `pileup -c'
- showALEN print alignment length (ALEN) following CIGAR
-\n/);
-}
diff --git a/sam/misc/soap2sam.pl b/sam/misc/soap2sam.pl
deleted file mode 100755
index b37135e..0000000
--- a/sam/misc/soap2sam.pl
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/usr/bin/perl -w
-
-# Contact: lh3
-# Version: 0.1.1
-
-use strict;
-use warnings;
-use Getopt::Std;
-
-&soap2sam;
-exit;
-
-sub mating {
- my ($s1, $s2) = @_;
- my $isize = 0;
- if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize
- my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3];
- my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3];
- $isize = $x2 - $x1;
- }
- # update mate coordinate
- if ($s2->[2] ne '*') {
- @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize);
- $s1->[1] |= 0x20 if ($s2->[1] & 0x10);
- } else {
- $s1->[1] |= 0x8;
- }
- if ($s1->[2] ne '*') {
- @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize);
- $s2->[1] |= 0x20 if ($s1->[1] & 0x10);
- } else {
- $s2->[1] |= 0x8;
- }
-}
-
-sub soap2sam {
- my %opts = ();
- getopts("p", \%opts);
- die("Usage: soap2sam.pl [-p] <aln.soap>\n") if (@ARGV == 0 && -t STDIN);
- my $is_paired = defined($opts{p});
- # core loop
- my @s1 = ();
- my @s2 = ();
- my ($s_last, $s_curr) = (\@s1, \@s2);
- while (<>) {
- s/[\177-\377]|[\000-\010]|[\012-\040]//g;
- next if (&soap2sam_aux($_, $s_curr, $is_paired) < 0);
- if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) {
- &mating($s_last, $s_curr);
- print join("\t", @$s_last), "\n";
- print join("\t", @$s_curr), "\n";
- @$s_last = (); @$s_curr = ();
- } else {
- print join("\t", @$s_last), "\n" if (@$s_last != 0);
- my $s = $s_last; $s_last = $s_curr; $s_curr = $s;
- }
- }
- print join("\t", @$s_last), "\n" if (@$s_last != 0);
-}
-
-sub soap2sam_aux {
- my ($line, $s, $is_paired) = @_;
- chomp($line);
- my @t = split(/\s+/, $line);
- return -1 if (@t < 9 || $line =~ /^\s/ || !$t[0]);
- @$s = ();
- # fix SOAP-2.1.x bugs
- @t = @t[0..2,4..$#t] unless ($t[3] =~ /^\d+$/);
- # read name
- $s->[0] = $t[0];
- $s->[0] =~ s/\/[12]$//g;
- # initial flag (will be updated later)
- $s->[1] = 0;
- $s->[1] |= 1 | 1<<($t[4] eq 'a'? 6 : 7);
- $s->[1] |= 2 if ($is_paired);
- # read & quality
- $s->[9] = $t[1];
- $s->[10] = (length($t[2]) > length($t[1]))? substr($t[2], 0, length($t[1])) : $t[2];
- # cigar
- $s->[5] = length($s->[9]) . "M";
- # coor
- $s->[2] = $t[7]; $s->[3] = $t[8];
- $s->[1] |= 0x10 if ($t[6] eq '-');
- # mapQ
- $s->[4] = $t[3] == 1? 30 : 0;
- # mate coordinate
- $s->[6] = '*'; $s->[7] = $s->[8] = 0;
- # aux
- push(@$s, "NM:i:$t[9]");
- my $md = '';
- if ($t[9]) {
- my @x;
- for (10 .. $#t) {
- push(@x, sprintf("%.3d,$1", $2)) if ($t[$_] =~ /^([ACGT])->(\d+)/i);
- }
- @x = sort(@x);
- my $a = 0;
- for (@x) {
- my ($y, $z) = split(",");
- $md .= (int($y)-$a) . $z;
- $a += $y - $a + 1;
- }
- $md .= length($t[1]) - $a;
- } else {
- $md = length($t[1]);
- }
- push(@$s, "MD:Z:$md");
- return 0;
-}
diff --git a/sam/misc/varfilter.py b/sam/misc/varfilter.py
deleted file mode 100755
index 03ce395..0000000
--- a/sam/misc/varfilter.py
+++ /dev/null
@@ -1,205 +0,0 @@
-#!/software/bin/python
-
-# Author: lh3, converted to python and modified to add -C option by Aylwyn Scally
-#
-# About:
-# varfilter.py is a port of Heng's samtools.pl varFilter script into
-# python, with an additional -C INT option. This option sets a minimum
-# consensus score, above which the script will output a pileup line
-# wherever it _could have_ called a variant, even if none is actually
-# called (i.e. hom-ref positions). This is important if you want to
-# subsequently merge the calls with those for another individual to get a
-# synoptic view of calls at each site. Without this option, and in all
-# other respects, it behaves like samtools.pl varFilter.
-#
-# Aylwyn Scally ***@sanger.ac.uk
-
-
-# Filtration code:
-#
-# C low CNS quality (hom-ref only)
-# d low depth
-# D high depth
-# W too many SNPs in a window (SNP only)
-# G close to a high-quality indel (SNP only)
-# Q low RMS mapping quality (SNP only)
-# g close to another indel with higher quality (indel only)
-# s low SNP quality (SNP only)
-# i low indel quality (indel only)
-
-
-import sys
-import getopt
-
-def usage():
- print '''usage: varfilter.py [options] [cns-pileup]
-
-Options: -Q INT minimum RMS mapping quality for SNPs
- -q INT minimum RMS mapping quality for gaps
- -d INT minimum read depth
- -D INT maximum read depth
- -S INT minimum SNP quality
- -i INT minimum indel quality
- -C INT minimum consensus quality for hom-ref sites
-
- -G INT min indel score for nearby SNP filtering
- -w INT SNP within INT bp around a gap to be filtered
-
- -W INT window size for filtering dense SNPs
- -N INT max number of SNPs in a window
-
- -l INT window size for filtering adjacent gaps
-
- -p print filtered variants'''
-
-def varFilter_aux(first, is_print):
- try:
- if first[1] == 0:
- sys.stdout.write("\t".join(first[4:]) + "\n")
- elif is_print:
- sys.stderr.write("\t".join(["UQdDWGgsiCX"[first[1]]] + first[4:]) + "\n")
- except IOError:
- sys.exit()
-
-mindepth = 3
-maxdepth = 100
-gapgapwin = 30
-minsnpmapq = 25
-mingapmapq = 10
-minindelscore = 25
-scorefactor = 100
-snpgapwin = 10
-densesnpwin = 10
-densesnps = 2
-printfilt = False
-minsnpq = 0
-minindelq = 0
-mincnsq = 0
-
-try:
- options, args = getopt.gnu_getopt(sys.argv[1:], 'pq:d:D:l:Q:w:W:N:G:S:i:C:', [])
-except getopt.GetoptError:
- usage()
- sys.exit(2)
-for (oflag, oarg) in options:
- if oflag == '-d': mindepth = int(oarg)
- if oflag == '-D': maxdepth = int(oarg)
- if oflag == '-l': gapgapwin = int(oarg)
- if oflag == '-Q': minsnpmapq = int(oarg)
- if oflag == '-q': mingapmapq = int(oarg)
- if oflag == '-G': minindelscore = int(oarg)
- if oflag == '-s': scorefactor = int(oarg)
- if oflag == '-w': snpgapwin = int(oarg)
- if oflag == '-W': densesnpwin = int(oarg)
- if oflag == '-C': mincnsq = int(oarg)
- if oflag == '-N': densesnps = int(oarg)
- if oflag == '-p': printfilt = True
- if oflag == '-S': minsnpq = int(oarg)
- if oflag == '-i': minindelq = int(oarg)
-
-if len(args) < 1:
- inp = sys.stdin
-else:
- inp = open(args[0])
-
-# calculate the window size
-max_dist = max(gapgapwin, snpgapwin, densesnpwin)
-
-staging = []
-for t in (line.strip().split() for line in inp):
- (flt, score) = (0, -1)
- # non-var sites
- if t[3] == '*/*':
- continue
- is_snp = t[2].upper() != t[3].upper()
- if not (is_snp or mincnsq):
- continue
- # clear the out-of-range elements
- while staging:
- # Still on the same chromosome and the first element's window still affects this position?
- if staging[0][4] == t[0] and int(staging[0][5]) + staging[0][2] + max_dist >= int(t[1]):
- break
- varFilter_aux(staging.pop(0), printfilt)
-
- # first a simple filter
- if int(t[7]) < mindepth:
- flt = 2
- elif int(t[7]) > maxdepth:
- flt = 3
- if t[2] == '*': # an indel
- if minindelq and minindelq > int(t[5]):
- flt = 8
- elif is_snp:
- if minsnpq and minsnpq> int(t[5]):
- flt = 7
- else:
- if mincnsq and mincnsq > int(t[4]):
- flt = 9
-
- # site dependent filters
- dlen = 0
- if flt == 0:
- if t[2] == '*': # an indel
- # If deletion, remember the length of the deletion
- (a,b) = t[3].split('/')
- alen = len(a) - 1
- blen = len(b) - 1
- if alen>blen:
- if a[0] == '-': dlen=alen
- elif b[0] == '-': dlen=blen
-
- if int(t[6]) < mingapmapq:
- flt = 1
- # filtering SNPs
- if int(t[5]) >= minindelscore:
- for x in (y for y in staging if y[3]):
- # Is it a SNP and is it outside the SNP filter window?
- if x[0] >= 0 or int(x[5]) + x[2] + snpgapwin < int(t[1]):
- continue
- if x[1] == 0:
- x[1] = 5
-
- # calculate the filtering score (different from indel quality)
- score = int(t[5])
- if t[8] != '*':
- score += scorefactor * int(t[10])
- if t[9] != '*':
- score += scorefactor * int(t[11])
- # check the staging list for indel filtering
- for x in (y for y in staging if y[3]):
- # Is it a SNP and is it outside the gap filter window
- if x[0] < 0 or int(x[5]) + x[2] + gapgapwin < int(t[1]):
- continue
- if x[0] < score:
- x[1] = 6
- else:
- flt = 6
- break
- else: # a SNP or hom-ref
- if int(t[6]) < minsnpmapq:
- flt = 1
- # check adjacent SNPs
- k = 1
- for x in (y for y in staging if y[3]):
- if x[0] < 0 and int(x[5]) + x[2] + densesnpwin >= int(t[1]) and (x[1] == 0 or x[1] == 4 or x[1] == 5):
- k += 1
-
- # filtering is necessary
- if k > densesnps:
- flt = 4
- for x in (y for y in staging if y[3]):
- if x[0] < 0 and int(x[5]) + x[2] + densesnpwin >= int(t[1]) and x[1] == 0:
- x[1] = 4
- else: # then check gap filter
- for x in (y for y in staging if y[3]):
- if x[0] < 0 or int(x[5]) + x[2] + snpgapwin < int(t[1]):
- continue
- if x[0] >= minindelscore:
- flt = 5
- break
-
- staging.append([score, flt, dlen, is_snp] + t)
-
-# output the last few elements in the staging list
-while staging:
- varFilter_aux(staging.pop(0), printfilt)
diff --git a/sam/misc/vcfutils.lua b/sam/misc/vcfutils.lua
deleted file mode 100755
index 51d374e..0000000
--- a/sam/misc/vcfutils.lua
+++ /dev/null
@@ -1,694 +0,0 @@
-#!/usr/bin/env luajit
-
------------------------------------
--- BEGIN: routines from klib.lua --
------------------------------------
-
--- Description: getopt() translated from the BSD getopt(); compatible with the default Unix getopt()
---[[ Example:
- for o, a in os.getopt(arg, 'a:b') do
- print(o, a)
- end
-]]--
-function os.getopt(args, ostr)
- local arg, place = nil, 0;
- return function ()
- if place == 0 then -- update scanning pointer
- place = 1
- if #args == 0 or args[1]:sub(1, 1) ~= '-' then place = 0; return nil end
- if #args[1] >= 2 then
- place = place + 1
- if args[1]:sub(2, 2) == '-' then -- found "--"
- table.remove(args, 1);
- place = 0
- return nil;
- end
- end
- end
- local optopt = place <= #args[1] and args[1]:sub(place, place) or nil
- place = place + 1;
- local oli = optopt and ostr:find(optopt) or nil
- if optopt == ':' or oli == nil then -- unknown option
- if optopt == '-' then return nil end
- if place > #args[1] then
- table.remove(args, 1);
- place = 0;
- end
- return '?';
- end
- oli = oli + 1;
- if ostr:sub(oli, oli) ~= ':' then -- do not need argument
- arg = nil;
- if place > #args[1] then
- table.remove(args, 1);
- place = 0;
- end
- else -- need an argument
- if place <= #args[1] then -- no white space
- arg = args[1]:sub(place);
- else
- table.remove(args, 1);
- if #args == 0 then -- an option requiring argument is the last one
- place = 0;
- if ostr:sub(1, 1) == ':' then return ':' end
- return '?';
- else arg = args[1] end
- end
- table.remove(args, 1);
- place = 0;
- end
- return optopt, arg;
- end
-end
-
--- Description: string split
-function string:split(sep, n)
- local a, start = {}, 1;
- sep = sep or "%s+";
- repeat
- local b, e = self:find(sep, start);
- if b == nil then
- table.insert(a, self:sub(start));
- break
- end
- a[#a+1] = self:sub(start, b - 1);
- start = e + 1;
- if n and #a == n then
- table.insert(a, self:sub(start));
- break
- end
- until start > #self;
- return a;
-end
-
--- Description: smart file open
-function io.xopen(fn, mode)
- mode = mode or 'r';
- if fn == nil then return io.stdin;
- elseif fn == '-' then return (mode == 'r' and io.stdin) or io.stdout;
- elseif fn:sub(-3) == '.gz' then return (mode == 'r' and io.popen('gzip -dc ' .. fn, 'r')) or io.popen('gzip > ' .. fn, 'w');
- elseif fn:sub(-4) == '.bz2' then return (mode == 'r' and io.popen('bzip2 -dc ' .. fn, 'r')) or io.popen('bgzip2 > ' .. fn, 'w');
- else return io.open(fn, mode) end
-end
-
--- Description: log gamma function
--- Required by: math.lbinom()
--- Reference: AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
-function math.lgamma(z)
- local x;
- x = 0.1659470187408462e-06 / (z+7);
- x = x + 0.9934937113930748e-05 / (z+6);
- x = x - 0.1385710331296526 / (z+5);
- x = x + 12.50734324009056 / (z+4);
- x = x - 176.6150291498386 / (z+3);
- x = x + 771.3234287757674 / (z+2);
- x = x - 1259.139216722289 / (z+1);
- x = x + 676.5203681218835 / z;
- x = x + 0.9999999999995183;
- return math.log(x) - 5.58106146679532777 - z + (z-0.5) * math.log(z+6.5);
-end
-
--- Description: regularized incomplete gamma function
--- Dependent on: math.lgamma()
---[[
- Formulas are taken from Wiki, with additional input from Numerical
- Recipes in C (for modified Lentz's algorithm) and AS245
- (http://lib.stat.cmu.edu/apstat/245).
-
- A good online calculator is available at:
-
- http://www.danielsoper.com/statcalc/calc23.aspx
-
- It calculates upper incomplete gamma function, which equals
- math.igamma(s,z,true)*math.exp(math.lgamma(s))
-]]--
-function math.igamma(s, z, complement)
-
- local function _kf_gammap(s, z)
- local sum, x = 1, 1;
- for k = 1, 100 do
- x = x * z / (s + k);
- sum = sum + x;
- if x / sum < 1e-14 then break end
- end
- return math.exp(s * math.log(z) - z - math.lgamma(s + 1.) + math.log(sum));
- end
-
- local function _kf_gammaq(s, z)
- local C, D, f, TINY;
- f = 1. + z - s; C = f; D = 0.; TINY = 1e-290;
- -- Modified Lentz's algorithm for computing continued fraction. See Numerical Recipes in C, 2nd edition, section 5.2
- for j = 1, 100 do
- local d;
- local a, b = j * (s - j), j*2 + 1 + z - s;
- D = b + a * D;
- if D < TINY then D = TINY end
- C = b + a / C;
- if C < TINY then C = TINY end
- D = 1. / D;
- d = C * D;
- f = f * d;
- if math.abs(d - 1) < 1e-14 then break end
- end
- return math.exp(s * math.log(z) - z - math.lgamma(s) - math.log(f));
- end
-
- if complement then
- return ((z <= 1 or z < s) and 1 - _kf_gammap(s, z)) or _kf_gammaq(s, z);
- else
- return ((z <= 1 or z < s) and _kf_gammap(s, z)) or (1 - _kf_gammaq(s, z));
- end
-end
-
-function math.brent(func, a, b, tol)
- local gold1, gold2, tiny, max_iter = 1.6180339887, 0.3819660113, 1e-20, 100
-
- local fa, fb = func(a, data), func(b, data)
- if fb > fa then -- swap, such that f(a) > f(b)
- a, b, fa, fb = b, a, fb, fa
- end
- local c = b + gold1 * (b - a)
- local fc = func(c) -- golden section extrapolation
- while fb > fc do
- local bound = b + 100.0 * (c - b) -- the farthest point where we want to go
- local r = (b - a) * (fb - fc)
- local q = (b - c) * (fb - fa)
- if math.abs(q - r) < tiny then -- avoid 0 denominator
- tmp = q > r and tiny or 0.0 - tiny
- else tmp = q - r end
- u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp) -- u is the parabolic extrapolation point
- if (b > u and u > c) or (b < u and u < c) then -- u lies between b and c
- fu = func(u)
- if fu < fc then -- (b,u,c) bracket the minimum
- a, b, fa, fb = b, u, fb, fu
- break
- elseif fu > fb then -- (a,b,u) bracket the minimum
- c, fc = u, fu
- break
- end
- u = c + gold1 * (c - b)
- fu = func(u) -- golden section extrapolation
- elseif (c > u and u > bound) or (c < u and u < bound) then -- u lies between c and bound
- fu = func(u)
- if fu < fc then -- fb > fc > fu
- b, c, u = c, u, c + gold1 * (c - b)
- fb, fc, fu = fc, fu, func(u)
- else -- (b,c,u) bracket the minimum
- a, b, c = b, c, u
- fa, fb, fc = fb, fc, fu
- break
- end
- elseif (u > bound and bound > c) or (u < bound and bound < c) then -- u goes beyond the bound
- u = bound
- fu = func(u)
- else -- u goes the other way around, use golden section extrapolation
- u = c + gold1 * (c - b)
- fu = func(u)
- end
- a, b, c = b, c, u
- fa, fb, fc = fb, fc, fu
- end
- if a > c then a, c = c, a end -- swap
-
- -- now, a<b<c, fa>fb and fb<fc, move on to Brent's algorithm
- local e, d = 0, 0
- local w, v, fw, fv
- w, v = b, b
- fw, fv = fb, fb
- for iter = 1, max_iter do
- local mid = 0.5 * (a + c)
- local tol1 = tol * math.abs(b) + tiny
- local tol2 = 2.0 * tol1
- if math.abs(b - mid) <= tol2 - 0.5 * (c - a) then return fb, b end -- found
- if math.abs(e) > tol1 then
- -- related to parabolic interpolation
- local r = (b - w) * (fb - fv)
- local q = (b - v) * (fb - fw)
- local p = (b - v) * q - (b - w) * r
- q = 2.0 * (q - r)
- if q > 0.0 then p = 0.0 - p
- else q = 0.0 - q end
- eold, e = e, d
- if math.abs(p) >= math.abs(0.5 * q * eold) or p <= q * (a - b) or p >= q * (c - b) then
- e = b >= mid and a - b or c - b
- d = gold2 * e
- else
- d, u = p / q, b + d -- actual parabolic interpolation happens here
- if u - a < tol2 or c - u < tol2 then
- d = mid > b and tol1 or 0.0 - tol1
- end
- end
- else -- golden section interpolation
- e = b >= min and a - b or c - b
- d = gold2 * e
- end
- u = fabs(d) >= tol1 and b + d or b + (d > 0.0 and tol1 or -tol1);
- fu = func(u)
- if fu <= fb then -- u is the minimum point so far
- if u >= b then a = b
- else c = b end
- v, w, b = w, b, u
- fv, fw, fb = fw, fb, fu
- else -- adjust (a,c) and (u,v,w)
- if u < b then a = u
- else c = u end
- if fu <= fw or w == b then
- v, w = w, u
- fv, fw = fw, fu
- elseif fu <= fv or v == b or v == w then
- v, fv = u, fu;
- end
- end
- end
- return fb, b
-end
-
-matrix = {}
-
--- Description: chi^2 test for contingency tables
--- Dependent on: math.igamma()
-function matrix.chi2(a)
- if #a == 2 and #a[1] == 2 then -- 2x2 table
- local x, z
- x = (a[1][1] + a[1][2]) * (a[2][1] + a[2][2]) * (a[1][1] + a[2][1]) * (a[1][2] + a[2][2])
- if x == 0 then return 0, 1, false end
- z = a[1][1] * a[2][2] - a[1][2] * a[2][1]
- z = (a[1][1] + a[1][2] + a[2][1] + a[2][2]) * z * z / x
- return z, math.igamma(.5, .5 * z, true), true
- else -- generic table
- local rs, cs, n, m, N, z = {}, {}, #a, #a[1], 0, 0
- for i = 1, n do rs[i] = 0 end
- for j = 1, m do cs[j] = 0 end
- for i = 1, n do -- compute column sum and row sum
- for j = 1, m do cs[j], rs[i] = cs[j] + a[i][j], rs[i] + a[i][j] end
- end
- for i = 1, n do N = N + rs[i] end
- for i = 1, n do -- compute the chi^2 statistics
- for j = 1, m do
- local E = rs[i] * cs[j] / N;
- z = z + (a[i][j] - E) * (a[i][j] - E) / E
- end
- end
- return z, math.igamma(.5 * (n-1) * (m-1), .5 * z, true), true;
- end
-end
-
----------------------------------
--- END: routines from klib.lua --
----------------------------------
-
-
---------------------------
--- BEGIN: misc routines --
---------------------------
-
--- precompute an array for PL->probability conversion
--- @param m maximum PL
-function algo_init_q2p(m)
- local q2p = {}
- for i = 0, m do
- q2p[i] = math.pow(10, -i / 10)
- end
- return q2p
-end
-
--- given the haplotype frequency, compute r^2
--- @param f 4 haplotype frequencies; f[] is 0-indexed.
--- @return r^2
-function algo_r2(f)
- local p = { f[0] + f[1], f[0] + f[2] }
- local D = f[0] * f[3] - f[1] * f[2]
- return (p[1] == 0 or p[2] == 0 or 1-p[1] == 0 or 1-p[2] == 0) and 0 or D * D / (p[1] * p[2] * (1 - p[1]) * (1 - p[2]))
-end
-
--- parse a VCF line to get PL
--- @param q2p is computed by algo_init_q2p()
-function text_parse_pl(t, q2p, parse_GT)
- parse_GT = parse_GT == nil and true or false
- local ht, gt, pl = {}, {}, {}
- local s, j0 = t[9]:split(':'), 0
- for j = 1, #s do
- if s[j] == 'PL' then j0 = j break end
- end
- local has_GT = (s[1] == 'GT' and parse_GT) and true or false
- for i = 10, #t do
- if j0 > 0 then
- local s = t[i]:split(':')
- local a, b = 1, s[j0]:find(',')
- pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, b - 1))]
- a, b = b + 1, s[j0]:find(',', b + 1)
- pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, b - 1))]
- a, b = b + 1, s[j0]:find(',', b + 1)
- pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, (b and b - 1) or nil))]
- end
- if has_GT then
- if t[i]:sub(1, 1) ~= '.' then
- local g = tonumber(t[i]:sub(1, 1)) + tonumber(t[i]:sub(3, 3));
- gt[#gt+1] = 1e-6; gt[#gt+1] = 1e-6; gt[#gt+1] = 1e-6
- gt[#gt - 2 + g] = 1
- ht[#ht+1] = tonumber(t[i]:sub(1, 1)); ht[#ht+1] = tonumber(t[i]:sub(3, 3));
- else
- gt[#gt+1] = 1; gt[#gt+1] = 1; gt[#gt+1] = 1
- ht[#ht+1] = -1; ht[#ht+1] = -1;
- end
- end
--- print(t[i], pl[#pl-2], pl[#pl-1], pl[#pl], gt[#gt-2], gt[#gt-1], gt[#gt])
- end
- if #pl == 0 then pl = nil end
- local x = has_GT and { t[1], t[2], ht, gt, pl } or { t[1], t[2], nil, nil, pl }
- return x
-end
-
--- Infer haplotype frequency
--- @param pdg genotype likelihoods P(D|g) generated by text_parse_pl(). pdg[] is 1-indexed.
--- @param eps precision [1e-5]
--- @return 2-locus haplotype frequencies, 0-indexed array
-function algo_hapfreq2(pdg, eps)
- eps = eps or 1e-5
- local n, f = #pdg[1] / 3, {[0]=0.25, 0.25, 0.25, 0.25}
- for iter = 1, 100 do
- local F = {[0]=0, 0, 0, 0}
- for i = 0, n - 1 do
- local p1, p2 = {[0]=pdg[1][i*3+1], pdg[1][i*3+2], pdg[1][i*3+3]}, {[0]=pdg[2][i*3+1], pdg[2][i*3+2], pdg[2][i*3+3]}
- local u = { [0]=
- f[0] * (f[0] * p1[0] * p2[0] + f[1] * p1[0] * p2[1] + f[2] * p1[1] * p2[0] + f[3] * p1[1] * p2[1]),
- f[1] * (f[0] * p1[0] * p2[1] + f[1] * p1[0] * p2[2] + f[2] * p1[1] * p2[1] + f[3] * p1[1] * p2[2]),
- f[2] * (f[0] * p1[1] * p2[0] + f[1] * p1[1] * p2[1] + f[2] * p1[2] * p2[0] + f[3] * p1[2] * p2[1]),
- f[3] * (f[0] * p1[1] * p2[1] + f[1] * p1[1] * p2[2] + f[2] * p1[2] * p2[1] + f[3] * p1[2] * p2[2])
- }
- local s = u[0] + u[1] + u[2] + u[3]
- s = 1 / (s * n)
- F[0] = F[0] + u[0] * s
- F[1] = F[1] + u[1] * s
- F[2] = F[2] + u[2] * s
- F[3] = F[3] + u[3] * s
- end
- local e = 0
- for k = 0, 3 do
- e = math.abs(f[k] - F[k]) > e and math.abs(f[k] - F[k]) or e
- end
- for k = 0, 3 do f[k] = F[k] end
- if e < eps then break end
--- print(f[0], f[1], f[2], f[3])
- end
- return f
-end
-
-------------------------
--- END: misc routines --
-------------------------
-
-
----------------------
--- BEGIN: commands --
----------------------
-
--- CMD vcf2bgl: convert PL tagged VCF to Beagle input --
-function cmd_vcf2bgl()
- if #arg == 0 then
- print("\nUsage: vcf2bgl.lua <in.vcf>")
- print("\nNB: This command finds PL by matching /(\\d+),(\\d+),(\\d+)/.\n");
- os.exit(1)
- end
-
- local lookup = {}
- for i = 0, 10000 do lookup[i] = string.format("%.4f", math.pow(10, -i/10)) end
-
- local fp = io.xopen(arg[1])
- for l in fp:lines() do
- if l:sub(1, 2) == '##' then -- meta lines; do nothing
- elseif l:sub(1, 1) == '#' then -- sample lines
- local t, s = l:split('\t'), {}
- for i = 10, #t do s[#s+1] = t[i]; s[#s+1] = t[i]; s[#s+1] = t[i] end
- print('marker', 'alleleA', 'alleleB', table.concat(s, '\t'))
- else -- data line
- local t = l:split('\t');
- if t[5] ~= '.' and t[5]:find(",") == nil and #t[5] == 1 and #t[4] == 1 then -- biallic SNP
- local x, z = -1, {};
- if t[9]:find('PL') then
- for i = 10, #t do
- local AA, Aa, aa = t[i]:match('(%d+),(%d+),(%d+)')
- AA = tonumber(AA); Aa = tonumber(Aa); aa = tonumber(aa);
- if AA ~= nil then
- z[#z+1] = lookup[AA]; z[#z+1] = lookup[Aa]; z[#z+1] = lookup[aa];
- else z[#z+1] = 1; z[#z+1] = 1; z[#z+1] = 1; end
- end
- print(t[1]..':'..t[2], t[4], t[5], table.concat(z, '\t'))
- elseif t[9]:find('GL') then
- print('Error: not implemented')
- os.exit(1)
- end
- end
- end
- end
- fp:close()
-end
-
--- CMD bgl2vcf: convert Beagle output to VCF
-function cmd_bgl2vcf()
- if #arg < 2 then
- print('Usage: bgl2vcf.lua <in.phased> <in.gprobs>')
- os.exit(1)
- end
-
- local fpp = io.xopen(arg[1]);
- local fpg = io.xopen(arg[2]);
- for lg in fpg:lines() do
- local tp, tg, a = fpp:read():split('%s'), lg:split('%s', 4), {}
- if tp[1] == 'I' then
- for i = 3, #tp, 2 do a[#a+1] = tp[i] end
- print('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', table.concat(a, '\t'))
- else
- local chr, pos = tg[1]:match('(%S+):(%d+)$')
- a = {chr, pos, '.', tg[2], tg[3], 30, '.', '.', 'GT'}
- for i = 3, #tp, 2 do
- a[#a+1] = ((tp[i] == tg[2] and 0) or 1) .. '|' .. ((tp[i+1] == tg[2] and 0) or 1)
- end
- print(table.concat(a, '\t'))
- end
- end
- fpg:close(); fpp:close();
-end
-
--- CMD freq: count alleles in each population
-function cmd_freq()
- -- parse the command line
- local site_only = true; -- print site allele frequency or not
- for c in os.getopt(arg, 's') do
- if c == 's' then site_only = false end
- end
- if #arg == 0 then
- print("\nUsage: vcfutils.lua freq [-s] <in.vcf> [samples.txt]\n")
- print("NB: 1) This command only considers biallelic variants.")
- print(" 2) Apply '-s' to get the allele frequency spectrum.")
- print(" 3) 'samples.txt' is TAB-delimited with each line consisting of sample and population.")
- print("")
- os.exit(1)
- end
-
- -- read the sample-population pairs
- local pop, sample = {}, {}
- if #arg > 1 then
- local fp = io.xopen(arg[2]);
- for l in fp:lines() do
- local s, p = l:match("^(%S+)%s+(%S+)"); -- sample, population pair
- sample[s] = p; -- FIXME: check duplications
- if pop[p] then table.insert(pop[p], s)
- else pop[p] = {s} end
- end
- fp:close();
- end
- pop['NA'] = {}
-
- -- parse VCF
- fp = (#arg >= 2 and io.xopen(arg[1])) or io.stdin;
- local col, cnt = {}, {};
- for k in pairs(pop) do
- col[k], cnt[k] = {}, {[0]=0};
- end
- for l in fp:lines() do
- if l:sub(1, 2) == '##' then -- meta lines; do nothing
- elseif l:sub(1, 1) == '#' then -- the sample line
- local t, del_NA = l:split('\t'), true;
- for i = 10, #t do
- local k = sample[t[i]]
- if k == nil then
- k, del_NA = 'NA', false
- table.insert(pop[k], t[i])
- end
- table.insert(col[k], i);
- table.insert(cnt[k], 0);
- table.insert(cnt[k], 0);
- end
- if del_NA then pop['NA'], col['NA'], cnt['NA'] = nil, nil, nil end
- else -- data lines
- local t = l:split('\t');
- if t[5] ~= '.' and t[5]:find(",") == nil then -- biallic
- if site_only == true then io.write(t[1], '\t', t[2], '\t', t[4], '\t', t[5]) end
- for k, v in pairs(col) do
- local ac, an = 0, 0;
- for i = 1, #v do
- local a1, a2 = t[v[i]]:match("^(%d).(%d)");
- if a1 ~= nil then ac, an = ac + a1 + a2, an + 2 end
- end
- if site_only == true then io.write('\t', k, ':', an, ':', ac) end
- if an == #cnt[k] then cnt[k][ac] = cnt[k][ac] + 1 end
- end
- if site_only == true then io.write('\n') end
- end
- end
- end
- fp:close();
-
- -- print
- if site_only == false then
- for k, v in pairs(cnt) do
- io.write(k .. "\t" .. #v);
- for i = 0, #v do io.write("\t" .. v[i]) end
- io.write('\n');
- end
- end
-end
-
-function cmd_vcf2chi2()
- if #arg < 3 then
- print("Usage: vcfutils.lua vcf2chi2 <in.vcf> <group1.list> <group2.list>");
- os.exit(1)
- end
-
- local g = {};
-
- -- read the list of groups
- local fp = io.xopen(arg[2]);
- for l in fp:lines() do local x = l:match("^(%S+)"); g[x] = 1 end -- FIXME: check duplicate
- fp:close()
- fp = io.xopen(arg[3]);
- for l in fp:lines() do local x = l:match("^(%S+)"); g[x] = 2 end
- fp:close()
-
- -- process VCF
- fp = io.xopen(arg[1])
- local h = {{}, {}}
- for l in fp:lines() do
- if l:sub(1, 2) == '##' then print(l) -- meta lines; do nothing
- elseif l:sub(1, 1) == '#' then -- sample lines
- local t = l:split('\t');
- for i = 10, #t do
- if g[t[i]] == 1 then table.insert(h[1], i)
- elseif g[t[i]] == 2 then table.insert(h[2], i) end
- end
- while #t > 8 do table.remove(t) end
- print(table.concat(t, "\t"))
- else -- data line
- local t = l:split('\t');
- if t[5] ~= '.' and t[5]:find(",") == nil then -- biallic
- local a = {{0, 0}, {0, 0}}
- for i = 1, 2 do
- for _, k in pairs(h[i]) do
- if t[k]:find("^0.0") then a[i][1] = a[i][1] + 2
- elseif t[k]:find("^1.1") then a[i][2] = a[i][2] + 2
- elseif t[k]:find("^0.1") or t[k]:find("^1.0") then
- a[i][1], a[i][2] = a[i][1] + 1, a[i][2] + 1
- end
- end
- end
- local chi2, p, succ = matrix.chi2(a);
- while #t > 8 do table.remove(t) end
- --print(a[1][1], a[1][2], a[2][1], a[2][2], chi2, p);
- if succ then print(table.concat(t, "\t") .. ";PCHI2=" .. string.format("%.3g", p)
- .. string.format(';AF1=%.4g;AF2=%.4g,%.4g', (a[1][2]+a[2][2]) / (a[1][1]+a[1][2]+a[2][1]+a[2][2]),
- a[1][2]/(a[1][1]+a[1][2]), a[2][2]/(a[2][1]+a[2][2])))
- else print(table.concat(t, "\t")) end
- end
- end
- end
- fp:close()
-end
-
--- CMD: compute r^2
-function cmd_r2()
- local w, is_ht, is_gt = 1, false, false
- for o, a in os.getopt(arg, 'w:hg') do
- if o == 'w' then w = tonumber(a)
- elseif o == 'h' then is_ht, is_gt = true, true
- elseif o == 'g' then is_gt = true
- end
- end
- if #arg == 0 then
- print("Usage: vcfutils.lua r2 [-hg] [-w 1] <in.vcf>")
- os.exit(1)
- end
- local stack, fp, q2p = {}, io.xopen(arg[1]), algo_init_q2p(1023)
- for l in fp:lines() do
- if l:sub(1, 1) ~= '#' then
- local t = l:split('\t')
- local x = text_parse_pl(t, q2p)
- if #t[5] == 1 and t[5] ~= '.' then -- biallelic
- local r2 = {}
- for k = 1, w do
- if is_gt == false then -- use PL
- if stack[k] then
- local pdg = { stack[k][5], x[5] }
- r2[#r2+1] = algo_r2(algo_hapfreq2(pdg))
- else r2[#r2+1] = 0 end
- elseif is_ht == false then -- use unphased GT
- if stack[k] then
- local pdg = { stack[k][4], x[4] }
- r2[#r2+1] = algo_r2(algo_hapfreq2(pdg))
- else r2[#r2+1] = 0 end
- else -- use phased GT
- if stack[k] then
- local f, ht = { [0]=0, 0, 0, 0 }, { stack[k][3], x[3] }
- for i = 1, #ht[1] do
- local j = ht[1][i] * 2 + ht[2][i]
- f[j] = f[j] + 1
- end
- local sum = f[0] + f[1] + f[2] + f[3]
- for k = 0, 3 do f[k] = f[k] / sum end
- r2[#r2+1] = algo_r2(f)
- else r2[#r2+1] = 0 end
- end
- end
- for k = 1, #r2 do
- r2[k] = string.format('%.3f', r2[k])
- end
- print(x[1], x[2], table.concat(r2, '\t'))
- if #stack == w then table.remove(stack, 1) end
- stack[#stack+1] = x
- end
- end
- end
- fp:close()
-end
-
--------------------
--- END: commands --
--------------------
-
-
--------------------
--- MAIN FUNCTION --
--------------------
-
-if #arg == 0 then
- print("\nUsage: vcfutils.lua <command> <arguments>\n")
- print("Command: freq count biallelic alleles in each population")
- print(" r2 compute r^2")
- print(" vcf2chi2 compute 1-degree chi-square between two groups of samples")
- print(" vcf2bgl convert PL annotated VCF to Beagle input")
- print(" bgl2vcf convert Beagle input to VCF")
- print("")
- os.exit(1)
-end
-
-local cmd = arg[1]
-table.remove(arg, 1)
-if cmd == 'vcf2bgl' then cmd_vcf2bgl()
-elseif cmd == 'bgl2vcf' then cmd_bgl2vcf()
-elseif cmd == 'freq' then cmd_freq()
-elseif cmd == 'r2' then cmd_r2()
-elseif cmd == 'vcf2chi2' then cmd_vcf2chi2()
-else
- print('ERROR: unknown command "' .. cmd .. '"')
- os.exit(1)
-end
diff --git a/sam/misc/wgsim.c b/sam/misc/wgsim.c
deleted file mode 100644
index b9c513c..0000000
--- a/sam/misc/wgsim.c
+++ /dev/null
@@ -1,419 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008 Genome Research Ltd (GRL).
- 2011 Heng Li <***@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/* This program is separated from maq's read simulator with Colin
- * Hercus' modification to allow longer indels. */
-
-#include <stdlib.h>
-#include <math.h>
-#include <time.h>
-#include <assert.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <ctype.h>
-#include <string.h>
-#include <zlib.h>
-#include "kseq.h"
-KSEQ_INIT(gzFile, gzread)
-
-#define PACKAGE_VERSION "0.3.0"
-
-const uint8_t nst_nt4_table[256] = {
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
-};
-
-/* Simple normal random number generator, copied from genran.c */
-
-double ran_normal()
-{
- static int iset = 0;
- static double gset;
- double fac, rsq, v1, v2;
- if (iset == 0) {
- do {
- v1 = 2.0 * drand48() - 1.0;
- v2 = 2.0 * drand48() - 1.0;
- rsq = v1 * v1 + v2 * v2;
- } while (rsq >= 1.0 || rsq == 0.0);
- fac = sqrt(-2.0 * log(rsq) / rsq);
- gset = v1 * fac;
- iset = 1;
- return v2 * fac;
- } else {
- iset = 0;
- return gset;
- }
-}
-
-/* wgsim */
-
-enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
-typedef unsigned short mut_t;
-static mut_t mutmsk = (mut_t)0xf000;
-
-typedef struct {
- int l, m; /* length and maximum buffer size */
- mut_t *s; /* sequence */
-} mutseq_t;
-
-static double ERR_RATE = 0.02;
-static double MUT_RATE = 0.001;
-static double INDEL_FRAC = 0.15;
-static double INDEL_EXTEND = 0.3;
-static double MAX_N_RATIO = 0.1;
-
-void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
-{
- int i, deleting = 0;
- mutseq_t *ret[2];
-
- ret[0] = hap1; ret[1] = hap2;
- ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l;
- ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m;
- ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
- ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
- for (i = 0; i != ks->seq.l; ++i) {
- int c;
- c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]];
- if (deleting) {
- if (drand48() < INDEL_EXTEND) {
- if (deleting & 1) ret[0]->s[i] |= DELETE;
- if (deleting & 2) ret[1]->s[i] |= DELETE;
- continue;
- } else deleting = 0;
- }
- if (c < 4 && drand48() < MUT_RATE) { // mutation
- if (drand48() >= INDEL_FRAC) { // substitution
- double r = drand48();
- c = (c + (int)(r * 3.0 + 1)) & 3;
- if (is_hap || drand48() < 0.333333) { // hom
- ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
- } else { // het
- ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
- }
- } else { // indel
- if (drand48() < 0.5) { // deletion
- if (is_hap || drand48() < 0.333333) { // hom-del
- ret[0]->s[i] = ret[1]->s[i] = DELETE;
- deleting = 3;
- } else { // het-del
- deleting = drand48()<0.5?1:2;
- ret[deleting-1]->s[i] = DELETE;
- }
- } else { // insertion
- int num_ins = 0, ins = 0;
- do {
- num_ins++;
- ins = (ins << 2) | (int)(drand48() * 4.0);
- } while (num_ins < 4 && drand48() < INDEL_EXTEND);
-
- if (is_hap || drand48() < 0.333333) { // hom-ins
- ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
- } else { // het-ins
- ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
- }
- }
- }
- }
- }
-}
-void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2)
-{
- int i;
- for (i = 0; i != ks->seq.l; ++i) {
- int c[3];
- c[0] = nst_nt4_table[(int)ks->seq.s[i]];
- c[1] = hap1->s[i]; c[2] = hap2->s[i];
- if (c[0] >= 4) continue;
- if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
- printf("%s\t%d\t", name, i+1);
- if (c[1] == c[2]) { // hom
- if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
- printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
- } else if ((c[1]&mutmsk) == DELETE) { // del
- printf("%c\t-\t-\n", "ACGTN"[c[0]]);
- } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins
- printf("-\t");
- int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
- while (n > 0) {
- putchar("ACGTN"[ins & 0x3]);
- ins >>= 2;
- n--;
- }
- printf("\t-\n");
- } else assert(0);
- } else { // het
- if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
- printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
- } else if ((c[1]&mutmsk) == DELETE) {
- printf("%c\t-\t+\n", "ACGTN"[c[0]]);
- } else if ((c[2]&mutmsk) == DELETE) {
- printf("%c\t-\t+\n", "ACGTN"[c[0]]);
- } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins1
- printf("-\t");
- int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
- while (n > 0) {
- putchar("ACGTN"[ins & 0x3]);
- ins >>= 2;
- n--;
- }
- printf("\t+\n");
- } else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2
- printf("-\t");
- int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
- while (n > 0) {
- putchar("ACGTN"[ins & 0x3]);
- ins >>= 2;
- n--;
- }
- printf("\t+\n");
- } else assert(0);
- }
- }
- }
-}
-
-void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
-{
- kseq_t *ks;
- mutseq_t rseq[2];
- gzFile fp_fa;
- uint64_t tot_len, ii;
- int i, l, n_ref;
- char *qstr;
- int size[2], Q, max_size;
- uint8_t *tmp_seq[2];
- mut_t *target;
-
- l = size_l > size_r? size_l : size_r;
- qstr = (char*)calloc(l+1, 1);
- tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
- tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
- size[0] = size_l; size[1] = size_r;
- max_size = size_l > size_r? size_l : size_r;
-
- Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
-
- fp_fa = gzopen(fn, "r");
- ks = kseq_init(fp_fa);
- tot_len = n_ref = 0;
- fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__);
- while ((l = kseq_read(ks)) >= 0) {
- tot_len += l;
- ++n_ref;
- }
- fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len);
- kseq_destroy(ks);
- gzclose(fp_fa);
-
- fp_fa = gzopen(fn, "r");
- ks = kseq_init(fp_fa);
- while ((l = kseq_read(ks)) >= 0) {
- uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
- if (l < dist + 3 * std_dev) {
- fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev);
- continue;
- }
-
- // generate mutations and print them out
- wgsim_mut_diref(ks, is_hap, rseq, rseq+1);
- wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1);
-
- for (ii = 0; ii != n_pairs; ++ii) { // the core loop
- double ran;
- int d, pos, s[2], is_flip = 0;
- int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k;
- FILE *fpo[2];
-
- do { // avoid boundary failure
- ran = ran_normal();
- ran = ran * std_dev + dist;
- d = (int)(ran + 0.5);
- d = d > max_size? d : max_size;
- pos = (int)((l - d + 1) * drand48());
- } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l);
-
- // flip or not
- if (drand48() < 0.5) {
- fpo[0] = fpout1; fpo[1] = fpout2;
- s[0] = size[0]; s[1] = size[1];
- } else {
- fpo[1] = fpout1; fpo[0] = fpout2;
- s[1] = size[0]; s[0] = size[1];
- is_flip = 1;
- }
-
- // generate the read sequences
- target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated
- n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
-
-#define __gen_read(x, start, iter) do { \
- for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < ks->seq.l && k < s[x]; iter) { \
- int c = target[i], mut_type = c & mutmsk; \
- if (ext_coor[x] < 0) { \
- if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
- ext_coor[x] = i; \
- } \
- if (mut_type == DELETE) ++n_indel[x]; \
- else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
- tmp_seq[x][k++] = c & 0xf; \
- if (mut_type == SUBSTITUTE) ++n_sub[x]; \
- } else { \
- int n, ins; \
- ++n_indel[x]; \
- tmp_seq[x][k++] = c & 0xf; \
- for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \
- tmp_seq[x][k++] = ins & 0x3; \
- } \
- } \
- if (k != s[x]) ext_coor[x] = -10; \
- } while (0)
-
- __gen_read(0, pos, ++i);
- __gen_read(1, pos + d - 1, --i);
- for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
- if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
- --ii;
- continue;
- }
-
- // generate sequencing errors
- for (j = 0; j < 2; ++j) {
- int n_n = 0;
- for (i = 0; i < s[j]; ++i) {
- int c = tmp_seq[j][i];
- if (c >= 4) { // actually c should be never larger than 4 if everything is correct
- c = 4;
- ++n_n;
- } else if (drand48() < ERR_RATE) {
- // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors
- c = (c + 1) & 3; // recurrent sequencing errors
- ++n_err[j];
- }
- tmp_seq[j][i] = c;
- }
- if ((double)n_n / s[j] > MAX_N_RATIO) break;
- }
- if (j < 2) { // too many ambiguous bases on one of the reads
- --ii;
- continue;
- }
-
- // print
- for (j = 0; j < 2; ++j) {
- for (i = 0; i < s[j]; ++i) qstr[i] = Q;
- qstr[i] = 0;
- fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1,
- n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
- (long long)ii, j==0? is_flip+1 : 2-is_flip);
- for (i = 0; i < s[j]; ++i)
- fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
- fprintf(fpo[j], "\n+\n%s\n", qstr);
- }
- }
- free(rseq[0].s); free(rseq[1].s);
- }
- kseq_destroy(ks);
- gzclose(fp_fa);
- free(qstr);
- free(tmp_seq[0]); free(tmp_seq[1]);
-}
-
-static int simu_usage()
-{
- fprintf(stderr, "\n");
- fprintf(stderr, "Program: wgsim (short read simulator)\n");
- fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
- fprintf(stderr, "Contact: Heng Li <***@sanger.ac.uk>\n\n");
- fprintf(stderr, "Usage: wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
- fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE);
- fprintf(stderr, " -d INT outer distance between the two ends [500]\n");
- fprintf(stderr, " -s INT standard deviation [50]\n");
- fprintf(stderr, " -N INT number of read pairs [1000000]\n");
- fprintf(stderr, " -1 INT length of the first read [70]\n");
- fprintf(stderr, " -2 INT length of the second read [70]\n");
- fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE);
- fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC);
- fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
- fprintf(stderr, " -S INT seed for random generator [-1]\n");
- fprintf(stderr, " -h haplotype mode\n");
- fprintf(stderr, "\n");
- return 1;
-}
-
-int main(int argc, char *argv[])
-{
- int64_t N;
- int dist, std_dev, c, size_l, size_r, is_hap = 0;
- FILE *fpout1, *fpout2;
- int seed = -1;
-
- N = 1000000; dist = 500; std_dev = 50;
- size_l = size_r = 70;
- while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:")) >= 0) {
- switch (c) {
- case 'd': dist = atoi(optarg); break;
- case 's': std_dev = atoi(optarg); break;
- case 'N': N = atoi(optarg); break;
- case '1': size_l = atoi(optarg); break;
- case '2': size_r = atoi(optarg); break;
- case 'e': ERR_RATE = atof(optarg); break;
- case 'r': MUT_RATE = atof(optarg); break;
- case 'R': INDEL_FRAC = atof(optarg); break;
- case 'X': INDEL_EXTEND = atof(optarg); break;
- case 'S': seed = atoi(optarg); break;
- case 'h': is_hap = 1; break;
- }
- }
- if (argc - optind < 3) return simu_usage();
- fpout1 = fopen(argv[optind+1], "w");
- fpout2 = fopen(argv[optind+2], "w");
- if (!fpout1 || !fpout2) {
- fprintf(stderr, "[wgsim] file open error\n");
- return 1;
- }
- srand48(seed > 0? seed : time(0));
- wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r);
-
- fclose(fpout1); fclose(fpout2);
- return 0;
-}
diff --git a/sam/misc/wgsim_eval.pl b/sam/misc/wgsim_eval.pl
deleted file mode 100755
index f919a06..0000000
--- a/sam/misc/wgsim_eval.pl
+++ /dev/null
@@ -1,91 +0,0 @@
-#!/usr/bin/perl -w
-
-# Contact: lh3
-# Version: 0.1.5
-
-use strict;
-use warnings;
-use Getopt::Std;
-
-&wgsim_eval;
-exit;
-
-sub wgsim_eval {
- my %opts = (g=>5);
- getopts('pcag:', \%opts);
- die("Usage: wgsim_eval.pl [-pca] [-g $opts{g}] <in.sam>\n") if (@ARGV == 0 && -t STDIN);
- my (@c0, @c1, %fnfp);
- my ($max_q, $flag) = (0, 0);
- my $gap = $opts{g};
- $flag |= 1 if (defined $opts{p});
- $flag |= 2 if (defined $opts{c});
- while (<>) {
- next if (/^\@/);
- my @t = split("\t");
- next if (@t < 11);
- my $line = $_;
- my ($q, $is_correct, $chr, $left, $rght) = (int($t[4]/10), 1, $t[2], $t[3], $t[3]);
- $max_q = $q if ($q > $max_q);
- # right coordinate
- $_ = $t[5]; s/(\d+)[MDN]/$rght+=$1,'x'/eg;
- --$rght;
- # correct for soft clipping
- my ($left0, $rght0) = ($left, $rght);
- $left -= $1 if (/^(\d+)[SH]/);
- $rght += $1 if (/(\d+)[SH]$/);
- $left0 -= $1 if (/(\d+)[SH]$/);
- $rght0 += $1 if (/^(\d+)[SH]/);
- # skip unmapped reads
- next if (($t[1]&0x4) || $chr eq '*');
- # parse read name and check
- if ($t[0] =~ /^(\S+)_(\d+)_(\d+)_/) {
- if ($1 ne $chr) { # different chr
- $is_correct = 0;
- } else {
- if ($flag & 2) {
- if (($t[1]&0x40) && !($t[1]&0x10)) { # F3, forward
- $is_correct = 0 if (abs($2 - $left) > $gap && abs($2 - $left0) > $gap);
- } elsif (($t[1]&0x40) && ($t[1]&0x10)) { # F3, reverse
- $is_correct = 0 if (abs($3 - $rght) > $gap && abs($3 - $rght0) > $gap);
- } elsif (($t[1]&0x80) && !($t[1]&0x10)) { # R3, forward
- $is_correct = 0 if (abs($3 - $left) > $gap && abs($3 - $left0) > $gap);
- } else { # R3, reverse
- $is_correct = 0 if (abs($2 - $rght) > $gap && abs($3 - $rght0) > $gap);
- }
- } else {
- if ($t[1] & 0x10) { # reverse
- $is_correct = 0 if (abs($3 - $rght) > $gap && abs($3 - $rght0) > $gap); # in case of indels that are close to the end of a reads
- } else {
- $is_correct = 0 if (abs($2 - $left) > $gap && abs($2 - $left0) > $gap);
- }
- }
- }
- } else {
- warn("[wgsim_eval] read '$t[0]' was not generated by wgsim?\n");
- next;
- }
- ++$c0[$q];
- ++$c1[$q] unless ($is_correct);
- @{$fnfp{$t[4]}} = (0, 0) unless (defined $fnfp{$t[4]});
- ++$fnfp{$t[4]}[0];
- ++$fnfp{$t[4]}[1] unless ($is_correct);
- print STDERR $line if (($flag&1) && !$is_correct && $q > 0);
- }
- # print
- my ($cc0, $cc1) = (0, 0);
- if (!defined($opts{a})) {
- for (my $i = $max_q; $i >= 0; --$i) {
- $c0[$i] = 0 unless (defined $c0[$i]);
- $c1[$i] = 0 unless (defined $c1[$i]);
- $cc0 += $c0[$i]; $cc1 += $c1[$i];
- printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0) if ($cc0);
- }
- } else {
- for (reverse(sort {$a<=>$b} (keys %fnfp))) {
- next if ($_ == 0);
- $cc0 += $fnfp{$_}[0];
- $cc1 += $fnfp{$_}[1];
- print join("\t", $_, $cc0, $cc1), "\n";
- }
- }
-}
diff --git a/sam/misc/zoom2sam.pl b/sam/misc/zoom2sam.pl
deleted file mode 100755
index 5306bfa..0000000
--- a/sam/misc/zoom2sam.pl
+++ /dev/null
@@ -1,97 +0,0 @@
-#!/usr/bin/perl -w
-
-# Contact: lh3
-# Version: 0.1.0
-
-use strict;
-use warnings;
-use Getopt::Std;
-
-&zoom2sam;
-exit;
-
-sub mating {
- my ($s1, $s2) = @_;
- my $isize = 0;
- if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize
- my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3];
- my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3];
- $isize = $x2 - $x1;
- }
- # update mate coordinate
- if ($s2->[2] ne '*') {
- @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize);
- $s1->[1] |= 0x20 if ($s2->[1] & 0x10);
- } else {
- $s1->[1] |= 0x8;
- }
- if ($s1->[2] ne '*') {
- @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize);
- $s2->[1] |= 0x20 if ($s1->[1] & 0x10);
- } else {
- $s2->[1] |= 0x8;
- }
-}
-
-sub zoom2sam {
- my %opts = ();
- getopts("p", \%opts);
- die("Usage: zoom2sam.pl [-p] <readLen> <aln.zoom>
-Warnings: This script only supports the default Illumina outputs.\n") if (@ARGV < 2);
- my $is_paired = defined($opts{p});
- my $len = shift(@ARGV);
- # core loop
- my @s1 = ();
- my @s2 = ();
- my ($s_last, $s_curr) = (\@s1, \@s2);
- while (<>) {
- &zoom2sam_aux($_, $s_curr, $is_paired, $len);
- if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) {
- &mating($s_last, $s_curr);
- print join("\t", @$s_last), "\n";
- print join("\t", @$s_curr), "\n";
- @$s_last = (); @$s_curr = ();
- } else {
- print join("\t", @$s_last), "\n" if (@$s_last != 0);
- my $s = $s_last; $s_last = $s_curr; $s_curr = $s;
- }
- }
- print join("\t", @$s_last), "\n" if (@$s_last != 0);
-}
-
-sub zoom2sam_aux {
- my ($line, $s, $is_paired, $len) = @_;
- chomp($line);
- my @t = split("\t", $line);
- @$s = ();
- # read name
- $s->[0] = $t[0];
- # initial flag (will be updated later)
- $s->[1] = 0;
- $s->[1] |= 1 | 1<<6 if ($s->[0] =~ /_F$/);
- $s->[1] |= 1 | 1<<7 if ($s->[0] =~ /_R$/);
- $s->[1] |= 2 if ($is_paired);
- # read & quality
- $s->[9] = "*"; $s->[10] = "*";
- # cigar
- $s->[5] = $len . "M";
- # coor
- my @s = split(/\s+/, $t[1]);
- $s->[2] = $s[0];
- $t[1] =~ /:(\d+)$/;
- $s->[3] = $1 + 1;
- if ($s->[0] =~ /_[FR]$/) {
- my $u = ($s->[0] =~ /_F$/)? 1 : 0;
- my $w = ($t[2] eq '+')? 1 : 0;
- $s->[1] |= 0x10 if ($u ^ $w);
- $s->[0] =~ s/_[FR]$//;
- } else {
- $s->[1] |= 0x10 if ($t[2] eq '-');
- }
- # mapQ
- $s->[4] = 30;
- # mate coordinate
- $s->[6] = '*'; $s->[7] = $s->[8] = 0;
- # aux
- push(@$s, "NM:i:$t[3]");
-}
diff --git a/sam/padding.c b/sam/padding.c
deleted file mode 100644
index a8da562..0000000
--- a/sam/padding.c
+++ /dev/null
@@ -1,479 +0,0 @@
-#include <string.h>
-#include <assert.h>
-#include <unistd.h>
-#include "kstring.h"
-#include "sam_header.h"
-#include "sam.h"
-#include "bam.h"
-#include "faidx.h"
-
-bam_header_t *bam_header_dup(const bam_header_t *h0); /*in sam.c*/
-
-static void replace_cigar(bam1_t *b, int n, uint32_t *cigar)
-{
- if (n != b->core.n_cigar) {
- int o = b->core.l_qname + b->core.n_cigar * 4;
- if (b->data_len + (n - b->core.n_cigar) * 4 > b->m_data) {
- b->m_data = b->data_len + (n - b->core.n_cigar) * 4;
- kroundup32(b->m_data);
- b->data = (uint8_t*)realloc(b->data, b->m_data);
- }
- memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->data_len - o);
- memcpy(b->data + b->core.l_qname, cigar, n * 4);
- b->data_len += (n - b->core.n_cigar) * 4;
- b->core.n_cigar = n;
- } else memcpy(b->data + b->core.l_qname, cigar, n * 4);
-}
-
-#define write_cigar(_c, _n, _m, _v) do { \
- if (_n == _m) { \
- _m = _m? _m<<1 : 4; \
- _c = (uint32_t*)realloc(_c, _m * 4); \
- } \
- _c[_n++] = (_v); \
- } while (0)
-
-static void unpad_seq(bam1_t *b, kstring_t *s)
-{
- int k, j, i;
- int length;
- uint32_t *cigar = bam1_cigar(b);
- uint8_t *seq = bam1_seq(b);
- // b->core.l_qseq gives length of the SEQ entry (including soft clips, S)
- // We need the padded length after alignment from the CIGAR (excluding
- // soft clips S, but including pads from CIGAR D operations)
- length = 0;
- for (k = 0; k < b->core.n_cigar; ++k) {
- int op, ol;
- op= bam_cigar_op(cigar[k]);
- ol = bam_cigar_oplen(cigar[k]);
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF || op == BAM_CDEL)
- length += ol;
- }
- ks_resize(s, length);
- for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
- int op, ol;
- op = bam_cigar_op(cigar[k]);
- ol = bam_cigar_oplen(cigar[k]);
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam1_seqi(seq, j);
- } else if (op == BAM_CSOFT_CLIP) {
- j += ol;
- } else if (op == BAM_CHARD_CLIP) {
- /* do nothing */
- } else if (op == BAM_CDEL) {
- for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
- } else {
- fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam1_qname(b));
- assert(-1);
- }
- }
- assert(length == s->l);
-}
-
-int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq)
-{
- char base;
- char *fai_ref = 0;
- int fai_ref_len = 0, k;
-
- fai_ref = fai_fetch(fai, ref_name, &fai_ref_len);
- if (fai_ref_len != ref_len) {
- fprintf(stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len);
- free(fai_ref);
- return -1;
- }
- ks_resize(seq, ref_len);
- seq->l = 0;
- for (k = 0; k < ref_len; ++k) {
- base = fai_ref[k];
- if (base == '-' || base == '*') {
- // Map gaps to null to match unpad_seq function
- seq->s[seq->l++] = 0;
- } else {
- int i = bam_nt16_table[(int)base];
- if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16
- fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name);
- free(fai_ref);
- return -1;
- }
- seq->s[seq->l++] = i;
- }
- }
- assert(ref_len == seq->l);
- free(fai_ref);
- return 0;
-}
-
-int get_unpadded_len(faidx_t *fai, char *ref_name, int padded_len)
-{
- char base;
- char *fai_ref = 0;
- int fai_ref_len = 0, k;
- int bases=0, gaps=0;
-
- fai_ref = fai_fetch(fai, ref_name, &fai_ref_len);
- if (fai_ref_len != padded_len) {
- fprintf(stderr, "[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len);
- free(fai_ref);
- return -1;
- }
- for (k = 0; k < padded_len; ++k) {
- //fprintf(stderr, "[depad] checking base %i of %i or %i\n", k+1, ref_len, strlen(fai_ref));
- base = fai_ref[k];
- if (base == '-' || base == '*') {
- gaps += 1;
- } else {
- int i = bam_nt16_table[(int)base];
- if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16
- fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence '%s'\n", base, (int)base, ref_name);
- free(fai_ref);
- return -1;
- }
- bases += 1;
- }
- }
- free(fai_ref);
- assert (padded_len == bases + gaps);
- return bases;
-}
-
-inline int * update_posmap(int *posmap, kstring_t ref)
-{
- int i, k;
- posmap = realloc(posmap, ref.m * sizeof(int));
- for (i = k = 0; i < ref.l; ++i) {
- posmap[i] = k;
- if (ref.s[i]) ++k;
- }
- return posmap;
-}
-
-int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
-{
- bam_header_t *h = 0;
- bam1_t *b = 0;
- kstring_t r, q;
- int r_tid = -1;
- uint32_t *cigar2 = 0;
- int ret = 0, n2 = 0, m2 = 0, *posmap = 0;
-
- b = bam_init1();
- r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
- int read_ret;
- h = in->header;
- while ((read_ret = samread(in, b)) >= 0) { // read one alignment from `in'
- uint32_t *cigar = bam1_cigar(b);
- n2 = 0;
- if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) {
- // fprintf(stderr, "[depad] Found embedded reference '%s'\n", bam1_qname(b));
- r_tid = b->core.tid;
- unpad_seq(b, &r);
- if (h->target_len[r_tid] != r.l) {
- fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %d in BAM header, but %ld in embedded reference\n", bam1_qname(b), h->target_len[r_tid], r.l);
- return -1;
- }
- if (fai) {
- // Check the embedded reference matches the FASTA file
- if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) {
- fprintf(stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]);
- return -1;
- }
- assert(r.l == q.l);
- int i;
- for (i = 0; i < r.l; ++i) {
- if (r.s[i] != q.s[i]) {
- // Show gaps as ASCII 45
- fprintf(stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n",
- h->target_name[b->core.tid], i+1,
- r.s[i] ? bam_nt16_rev_table[(int)r.s[i]] : 45,
- q.s[i] ? bam_nt16_rev_table[(int)q.s[i]] : 45);
- return -1;
- }
- }
- }
- write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
- replace_cigar(b, n2, cigar2);
- posmap = update_posmap(posmap, r);
- } else if (b->core.n_cigar > 0) {
- int i, k, op;
- if (b->core.tid < 0) {
- fprintf(stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam1_qname(b));
- return -1;
- } else if (b->core.tid == r_tid) {
- ; // good case, reference available
- //fprintf(stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam1_qname(b));
- } else if (fai) {
- if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
- fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
- return -1;
- }
- posmap = update_posmap(posmap, r);
- r_tid = b->core.tid;
- // fprintf(stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]);
- } else {
- fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]);
- return -1;
- }
- unpad_seq(b, &q);
- if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) {
- write_cigar(cigar2, n2, m2, cigar[0]);
- } else if (bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) {
- write_cigar(cigar2, n2, m2, cigar[0]);
- if (b->core.n_cigar > 2 && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) {
- write_cigar(cigar2, n2, m2, cigar[1]);
- }
- }
- /* Determine CIGAR operator for each base in the aligned read */
- for (i = 0, k = b->core.pos; i < q.l; ++i, ++k)
- q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD);
- /* Include any pads if starts with an insert */
- if (q.s[0] == BAM_CINS) {
- for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k);
- if (k) write_cigar(cigar2, n2, m2, bam_cigar_gen(k, BAM_CPAD));
- }
- /* Count consecutive CIGAR operators to turn into a CIGAR string */
- for (i = k = 1, op = q.s[0]; i < q.l; ++i) {
- if (op != q.s[i]) {
- write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
- op = q.s[i]; k = 1;
- } else ++k;
- }
- write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
- if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) {
- write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
- } else if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CHARD_CLIP) {
- if (b->core.n_cigar > 2 && bam_cigar_op(cigar[b->core.n_cigar-2]) == BAM_CSOFT_CLIP) {
- write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-2]);
- }
- write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
- }
- /* Remove redundant P operators between M/X/=/D operators, e.g. 5M2P10M -> 15M */
- int pre_op, post_op;
- for (i = 2; i < n2; ++i)
- if (bam_cigar_op(cigar2[i-1]) == BAM_CPAD) {
- pre_op = bam_cigar_op(cigar2[i-2]);
- post_op = bam_cigar_op(cigar2[i]);
- /* Note don't need to check for X/= as code above will use M only */
- if ((pre_op == BAM_CMATCH || pre_op == BAM_CDEL) && (post_op == BAM_CMATCH || post_op == BAM_CDEL)) {
- /* This is a redundant P operator */
- cigar2[i-1] = 0; // i.e. 0M
- /* If had same operator either side, combine them in post_op */
- if (pre_op == post_op) {
- /* If CIGAR M, could treat as simple integers since BAM_CMATCH is zero*/
- cigar2[i] = bam_cigar_gen(bam_cigar_oplen(cigar2[i-2]) + bam_cigar_oplen(cigar2[i]), post_op);
- cigar2[i-2] = 0; // i.e. 0M
- }
- }
- }
- /* Remove the zero'd operators (0M) */
- for (i = k = 0; i < n2; ++i)
- if (cigar2[i]) cigar2[k++] = cigar2[i];
- n2 = k;
- replace_cigar(b, n2, cigar2);
- b->core.pos = posmap[b->core.pos];
- if (b->core.mtid < 0 || b->core.mpos < 0) {
- /* Nice case, no mate to worry about*/
- // fprintf(stderr, "[depad] Read '%s' mate not mapped\n", bam1_qname(b));
- /* TODO - Warning if FLAG says mate should be mapped? */
- /* Clean up funny input where mate position is given but mate reference is missing: */
- b->core.mtid = -1;
- b->core.mpos = -1;
- } else if (b->core.mtid == b->core.tid) {
- /* Nice case, same reference */
- // fprintf(stderr, "[depad] Read '%s' mate mapped to same ref\n", bam1_qname(b));
- b->core.mpos = posmap[b->core.mpos];
- } else {
- /* Nasty case, Must load alternative posmap */
- // fprintf(stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]);
- if (!fai) {
- fprintf(stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]);
- return -1;
- }
- /* Temporarily load the other reference sequence */
- if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) {
- fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]);
- return -1;
- }
- posmap = update_posmap(posmap, r);
- b->core.mpos = posmap[b->core.mpos];
- /* Restore the reference and posmap*/
- if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
- fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
- return -1;
- }
- posmap = update_posmap(posmap, r);
- }
- }
- samwrite(out, b);
- }
- if (read_ret < -1) {
- fprintf(stderr, "[depad] truncated file.\n");
- ret = 1;
- }
- free(r.s); free(q.s); free(posmap);
- bam_destroy1(b);
- return ret;
-}
-
-bam_header_t * fix_header(bam_header_t *old, faidx_t *fai)
-{
- int i = 0, unpadded_len = 0;
- bam_header_t *header = 0 ;
-
- header = bam_header_dup(old);
- for (i = 0; i < old->n_targets; ++i) {
- unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]);
- if (unpadded_len < 0) {
- fprintf(stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]);
- } else {
- header->target_len[i] = unpadded_len;
- //fprintf(stderr, "[depad] Recalculating '%s' length %i -> %i\n", old->target_name[i], old->target_len[i], header->target_len[i]);
- }
- }
- /* Duplicating the header allocated new buffer for header string */
- /* After modifying the @SQ lines it will only get smaller, since */
- /* the LN entries will be the same or shorter, and we'll remove */
- /* any MD entries (MD5 checksums). */
- assert(strlen(old->text) == strlen(header->text));
- assert (0==strcmp(old->text, header->text));
- const char *text;
- text = old->text;
- header->text[0] = '\0'; /* Resuse the allocated buffer */
- char * newtext = header->text;
- char * end=NULL;
- while (text[0]=='@') {
- end = strchr(text, '\n');
- assert(end != 0);
- if (text[1]=='S' && text[2]=='Q' && text[3]=='\t') {
- /* TODO - edit the @SQ line here to remove MD and fix LN. */
- /* For now just remove the @SQ line, and samtools will */
- /* automatically generate a minimal replacement with LN. */
- /* However, that discards any other tags like AS, SP, UR. */
- //fprintf(stderr, "[depad] Removing @SQ line\n");
- } else {
- /* Copy this line to the new header */
- strncat(newtext, text, end - text + 1);
- }
- text = end + 1;
- }
- assert (text[0]=='\0');
- /* Check we didn't overflow the buffer */
- assert (strlen(header->text) <= strlen(old->text));
- if (strlen(header->text) < header->l_text) {
- //fprintf(stderr, "[depad] Reallocating header buffer\n");
- assert (newtext == header->text);
- newtext = malloc(strlen(header->text) + 1);
- strcpy(newtext, header->text);
- free(header->text);
- header->text = newtext;
- header->l_text = strlen(newtext);
- }
- //fprintf(stderr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text);
- return header;
-}
-
-static int usage(int is_long_help);
-
-int main_pad2unpad(int argc, char *argv[])
-{
- samfile_t *in = 0, *out = 0;
- bam_header_t *h = 0;
- faidx_t *fai = 0;
- int c, is_bamin = 1, compress_level = -1, is_bamout = 1, is_long_help = 0;
- char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0;
- int ret=0;
-
- /* parse command-line options */
- strcpy(in_mode, "r"); strcpy(out_mode, "w");
- while ((c = getopt(argc, argv, "Sso:u1T:?")) >= 0) {
- switch (c) {
- case 'S': is_bamin = 0; break;
- case 's': assert(compress_level == -1); is_bamout = 0; break;
- case 'o': fn_out = strdup(optarg); break;
- case 'u': assert(is_bamout == 1); compress_level = 0; break;
- case '1': assert(is_bamout == 1); compress_level = 1; break;
- case 'T': fn_ref = strdup(optarg); break;
- case '?': is_long_help = 1; break;
- default: return usage(is_long_help);
- }
- }
- if (argc == optind) return usage(is_long_help);
-
- if (is_bamin) strcat(in_mode, "b");
- if (is_bamout) strcat(out_mode, "b");
- strcat(out_mode, "h");
- if (compress_level >= 0) {
- char tmp[2];
- tmp[0] = compress_level + '0'; tmp[1] = '\0';
- strcat(out_mode, tmp);
- }
-
- // Load FASTA reference (also needed for SAM -> BAM if missing header)
- if (fn_ref) {
- fn_list = samfaipath(fn_ref);
- fai = fai_load(fn_ref);
- }
- // open file handlers
- if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) {
- fprintf(stderr, "[depad] failed to open \"%s\" for reading.\n", argv[optind]);
- ret = 1;
- goto depad_end;
- }
- if (in->header == 0) {
- fprintf(stderr, "[depad] failed to read the header from \"%s\".\n", argv[optind]);
- ret = 1;
- goto depad_end;
- }
- if (in->header->text == 0 || in->header->l_text == 0) {
- fprintf(stderr, "[depad] Warning - failed to read any header text from \"%s\".\n", argv[optind]);
- assert (0 == in->header->l_text);
- assert (0 == in->header->text);
- }
- if (fn_ref) {
- h = fix_header(in->header, fai);
- } else {
- fprintf(stderr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n");
- h = in->header;
- }
- if ((out = samopen(fn_out? fn_out : "-", out_mode, h)) == 0) {
- fprintf(stderr, "[depad] failed to open \"%s\" for writing.\n", fn_out? fn_out : "standard output");
- ret = 1;
- goto depad_end;
- }
-
- // Do the depad
- ret = bam_pad2unpad(in, out, fai);
-
-depad_end:
- // close files, free and return
- if (fai) fai_destroy(fai);
- if (h != in->header) bam_header_destroy(h);
- samclose(in);
- samclose(out);
- free(fn_list); free(fn_out);
- return ret;
-}
-
-static int usage(int is_long_help)
-{
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools depad <in.bam>\n\n");
- fprintf(stderr, "Options: -s output is SAM (default is BAM)\n");
- fprintf(stderr, " -S input is SAM (default is BAM)\n");
- fprintf(stderr, " -u uncompressed BAM output (can't use with -s)\n");
- fprintf(stderr, " -1 fast compression BAM output (can't use with -s)\n");
- fprintf(stderr, " -T FILE reference sequence file [null]\n");
- fprintf(stderr, " -o FILE output file name [stdout]\n");
- fprintf(stderr, " -? longer help\n");
- fprintf(stderr, "\n");
- if (is_long_help)
- fprintf(stderr, "Notes:\n\
-\n\
- 1. Requires embedded reference sequences (before the reads for that reference),\n\
- with the future aim to also support a FASTA padded reference sequence file.\n\
-\n\
- 2. The input padded alignment read's CIGAR strings must not use P or I operators.\n\
-\n");
- return 1;
-}
diff --git a/sam/phase.c b/sam/phase.c
deleted file mode 100644
index ef4eff9..0000000
--- a/sam/phase.c
+++ /dev/null
@@ -1,687 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <math.h>
-#include <zlib.h>
-#include "bam.h"
-#include "errmod.h"
-
-#include "kseq.h"
-KSTREAM_INIT(gzFile, gzread, 16384)
-
-#define MAX_VARS 256
-#define FLIP_PENALTY 2
-#define FLIP_THRES 4
-#define MASK_THRES 3
-
-#define FLAG_FIX_CHIMERA 0x1
-#define FLAG_LIST_EXCL 0x4
-#define FLAG_DROP_AMBI 0x8
-
-typedef struct {
- // configurations, initialized in the main function
- int flag, k, min_baseQ, min_varLOD, max_depth;
- // other global variables
- int vpos_shift;
- bamFile fp;
- char *pre;
- bamFile out[3];
- // alignment queue
- int n, m;
- bam1_t **b;
-} phaseg_t;
-
-typedef struct {
- int8_t seq[MAX_VARS]; // TODO: change to dynamic memory allocation!
- int vpos, beg, end;
- uint32_t vlen:16, single:1, flip:1, phase:1, phased:1, ambig:1;
- uint32_t in:16, out:16; // in-phase and out-phase
-} frag_t, *frag_p;
-
-#define rseq_lt(a,b) ((a)->vpos < (b)->vpos)
-
-#include "khash.h"
-KHASH_SET_INIT_INT64(set64)
-KHASH_MAP_INIT_INT64(64, frag_t)
-
-typedef khash_t(64) nseq_t;
-
-#include "ksort.h"
-KSORT_INIT(rseq, frag_p, rseq_lt)
-
-static char nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
-
-static inline uint64_t X31_hash_string(const char *s)
-{
- uint64_t h = *s;
- if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
- return h;
-}
-
-static void count1(int l, const uint8_t *seq, int *cnt)
-{
- int i, j, n_ambi;
- uint32_t z, x;
- if (seq[l-1] == 0) return; // do nothing is the last base is ambiguous
- for (i = n_ambi = 0; i < l; ++i) // collect ambiguous bases
- if (seq[i] == 0) ++n_ambi;
- if (l - n_ambi <= 1) return; // only one SNP
- for (x = 0; x < 1u<<n_ambi; ++x) { // count
- for (i = j = 0, z = 0; i < l; ++i) {
- int c;
- if (seq[i]) c = seq[i] - 1;
- else {
- c = x>>j&1;
- ++j;
- }
- z = z<<1 | c;
- }
- ++cnt[z];
- }
-}
-
-static int **count_all(int l, int vpos, nseq_t *hash)
-{
- khint_t k;
- int i, j, **cnt;
- uint8_t *seq;
- seq = calloc(l, 1);
- cnt = calloc(vpos, sizeof(void*));
- for (i = 0; i < vpos; ++i) cnt[i] = calloc(1<<l, sizeof(int));
- for (k = 0; k < kh_end(hash); ++k) {
- if (kh_exist(hash, k)) {
- frag_t *f = &kh_val(hash, k);
- if (f->vpos >= vpos || f->single) continue; // out of region; or singleton
- if (f->vlen == 1) { // such reads should be flagged as deleted previously if everything is right
- f->single = 1;
- continue;
- }
- for (j = 1; j < f->vlen; ++j) {
- for (i = 0; i < l; ++i)
- seq[i] = j < l - 1 - i? 0 : f->seq[j - (l - 1 - i)];
- count1(l, seq, cnt[f->vpos + j]);
- }
- }
- }
- free(seq);
- return cnt;
-}
-
-// phasing
-static int8_t *dynaprog(int l, int vpos, int **w)
-{
- int *f[2], *curr, *prev, max, i;
- int8_t **b, *h = 0;
- uint32_t x, z = 1u<<(l-1), mask = (1u<<l) - 1;
- f[0] = calloc(z, sizeof(int));
- f[1] = calloc(z, sizeof(int));
- b = calloc(vpos, sizeof(void*));
- prev = f[0]; curr = f[1];
- // fill the backtrack matrix
- for (i = 0; i < vpos; ++i) {
- int *wi = w[i], *tmp;
- int8_t *bi;
- bi = b[i] = calloc(z, 1);
- /* In the following, x is the current state, which is the
- * lexicographically smaller local haplotype. xc is the complement of
- * x, or the larger local haplotype; y0 and y1 are the two predecessors
- * of x. */
- for (x = 0; x < z; ++x) { // x0 is the smaller
- uint32_t y0, y1, xc;
- int c0, c1;
- xc = ~x&mask; y0 = x>>1; y1 = xc>>1;
- c0 = prev[y0] + wi[x] + wi[xc];
- c1 = prev[y1] + wi[x] + wi[xc];
- if (c0 > c1) bi[x] = 0, curr[x] = c0;
- else bi[x] = 1, curr[x] = c1;
- }
- tmp = prev; prev = curr; curr = tmp; // swap
- }
- { // backtrack
- uint32_t max_x = 0;
- int which = 0;
- h = calloc(vpos, 1);
- for (x = 0, max = 0, max_x = 0; x < z; ++x)
- if (prev[x] > max) max = prev[x], max_x = x;
- for (i = vpos - 1, x = max_x; i >= 0; --i) {
- h[i] = which? (~x&1) : (x&1);
- which = b[i][x]? !which : which;
- x = b[i][x]? (~x&mask)>>1 : x>>1;
- }
- }
- // free
- for (i = 0; i < vpos; ++i) free(b[i]);
- free(f[0]); free(f[1]); free(b);
- return h;
-}
-
-// phase each fragment
-static uint64_t *fragphase(int vpos, const int8_t *path, nseq_t *hash, int flip)
-{
- khint_t k;
- uint64_t *pcnt;
- uint32_t *left, *rght, max;
- left = rght = 0; max = 0;
- pcnt = calloc(vpos, 8);
- for (k = 0; k < kh_end(hash); ++k) {
- if (kh_exist(hash, k)) {
- int i, c[2];
- frag_t *f = &kh_val(hash, k);
- if (f->vpos >= vpos) continue;
- // get the phase
- c[0] = c[1] = 0;
- for (i = 0; i < f->vlen; ++i) {
- if (f->seq[i] == 0) continue;
- ++c[f->seq[i] == path[f->vpos + i] + 1? 0 : 1];
- }
- f->phase = c[0] > c[1]? 0 : 1;
- f->in = c[f->phase]; f->out = c[1 - f->phase];
- f->phased = f->in == f->out? 0 : 1;
- f->ambig = (f->in && f->out && f->out < 3 && f->in <= f->out + 1)? 1 : 0;
- // fix chimera
- f->flip = 0;
- if (flip && c[0] >= 3 && c[1] >= 3) {
- int sum[2], m, mi, md;
- if (f->vlen > max) { // enlarge the array
- max = f->vlen;
- kroundup32(max);
- left = realloc(left, max * 4);
- rght = realloc(rght, max * 4);
- }
- for (i = 0, sum[0] = sum[1] = 0; i < f->vlen; ++i) { // get left counts
- if (f->seq[i]) {
- int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
- ++sum[c == path[f->vpos + i]? 0 : 1];
- }
- left[i] = sum[1]<<16 | sum[0];
- }
- for (i = f->vlen - 1, sum[0] = sum[1] = 0; i >= 0; --i) { // get right counts
- if (f->seq[i]) {
- int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
- ++sum[c == path[f->vpos + i]? 0 : 1];
- }
- rght[i] = sum[1]<<16 | sum[0];
- }
- // find the best flip point
- for (i = m = 0, mi = -1, md = -1; i < f->vlen - 1; ++i) {
- int a[2];
- a[0] = (left[i]&0xffff) + (rght[i+1]>>16&0xffff) - (rght[i+1]&0xffff) * FLIP_PENALTY;
- a[1] = (left[i]>>16&0xffff) + (rght[i+1]&0xffff) - (rght[i+1]>>16&0xffff) * FLIP_PENALTY;
- if (a[0] > a[1]) {
- if (a[0] > m) m = a[0], md = 0, mi = i;
- } else {
- if (a[1] > m) m = a[1], md = 1, mi = i;
- }
- }
- if (m - c[0] >= FLIP_THRES && m - c[1] >= FLIP_THRES) { // then flip
- f->flip = 1;
- if (md == 0) { // flip the tail
- for (i = mi + 1; i < f->vlen; ++i)
- if (f->seq[i] == 1) f->seq[i] = 2;
- else if (f->seq[i] == 2) f->seq[i] = 1;
- } else { // flip the head
- for (i = 0; i <= mi; ++i)
- if (f->seq[i] == 1) f->seq[i] = 2;
- else if (f->seq[i] == 2) f->seq[i] = 1;
- }
- }
- }
- // update pcnt[]
- if (!f->single) {
- for (i = 0; i < f->vlen; ++i) {
- int c;
- if (f->seq[i] == 0) continue;
- c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
- if (c == path[f->vpos + i]) {
- if (f->phase == 0) ++pcnt[f->vpos + i];
- else pcnt[f->vpos + i] += 1ull<<32;
- } else {
- if (f->phase == 0) pcnt[f->vpos + i] += 1<<16;
- else pcnt[f->vpos + i] += 1ull<<48;
- }
- }
- }
- }
- }
- free(left); free(rght);
- return pcnt;
-}
-
-static uint64_t *genmask(int vpos, const uint64_t *pcnt, int *_n)
-{
- int i, max = 0, max_i = -1, m = 0, n = 0, beg = 0, score = 0;
- uint64_t *list = 0;
- for (i = 0; i < vpos; ++i) {
- uint64_t x = pcnt[i];
- int c[4], pre = score, s;
- c[0] = x&0xffff; c[1] = x>>16&0xffff; c[2] = x>>32&0xffff; c[3] = x>>48&0xffff;
- s = (c[1] + c[3] == 0)? -(c[0] + c[2]) : (c[1] + c[3] - 1);
- if (c[3] > c[2]) s += c[3] - c[2];
- if (c[1] > c[0]) s += c[1] - c[0];
- score += s;
- if (score < 0) score = 0;
- if (pre == 0 && score > 0) beg = i; // change from zero to non-zero
- if ((i == vpos - 1 || score == 0) && max >= MASK_THRES) {
- if (n == m) {
- m = m? m<<1 : 4;
- list = realloc(list, m * 8);
- }
- list[n++] = (uint64_t)beg<<32 | max_i;
- i = max_i; // reset i to max_i
- score = 0;
- } else if (score > max) max = score, max_i = i;
- if (score == 0) max = 0;
- }
- *_n = n;
- return list;
-}
-
-// trim heading and tailing ambiguous bases; mark deleted and remove sequence
-static int clean_seqs(int vpos, nseq_t *hash)
-{
- khint_t k;
- int ret = 0;
- for (k = 0; k < kh_end(hash); ++k) {
- if (kh_exist(hash, k)) {
- frag_t *f = &kh_val(hash, k);
- int beg, end, i;
- if (f->vpos >= vpos) {
- ret = 1;
- continue;
- }
- for (i = 0; i < f->vlen; ++i)
- if (f->seq[i] != 0) break;
- beg = i;
- for (i = f->vlen - 1; i >= 0; --i)
- if (f->seq[i] != 0) break;
- end = i + 1;
- if (end - beg <= 0) kh_del(64, hash, k);
- else {
- if (beg != 0) memmove(f->seq, f->seq + beg, end - beg);
- f->vpos += beg; f->vlen = end - beg;
- f->single = f->vlen == 1? 1 : 0;
- }
- }
- }
- return ret;
-}
-
-static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash)
-{
- int i, is_flip, drop_ambi;
- drop_ambi = g->flag & FLAG_DROP_AMBI;
- is_flip = (drand48() < 0.5);
- for (i = 0; i < g->n; ++i) {
- int end, which;
- uint64_t key;
- khint_t k;
- bam1_t *b = g->b[i];
- key = X31_hash_string(bam1_qname(b));
- end = bam_calend(&b->core, bam1_cigar(b));
- if (end > min_pos) break;
- k = kh_get(64, hash, key);
- if (k == kh_end(hash)) which = 3;
- else {
- frag_t *f = &kh_val(hash, k);
- if (f->ambig) which = drop_ambi? 2 : 3;
- else if (f->phased && f->flip) which = 2;
- else if (f->phased == 0) which = 3;
- else { // phased and not flipped
- char c = 'Y';
- which = f->phase;
- bam_aux_append(b, "ZP", 'A', 1, (uint8_t*)&c);
- }
- if (which < 2 && is_flip) which = 1 - which; // increase the randomness
- }
- if (which == 3) which = (drand48() < 0.5);
- bam_write1(g->out[which], b);
- bam_destroy1(b);
- g->b[i] = 0;
- }
- memmove(g->b, g->b + i, (g->n - i) * sizeof(void*));
- g->n -= i;
-}
-
-static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash)
-{
- int i, j, n_seqs = kh_size(hash), n_masked = 0, min_pos;
- khint_t k;
- frag_t **seqs;
- int8_t *path, *sitemask;
- uint64_t *pcnt, *regmask;
-
- if (vpos == 0) return 0;
- i = clean_seqs(vpos, hash); // i is true if hash has an element with its vpos >= vpos
- min_pos = i? cns[vpos]>>32 : 0x7fffffff;
- if (vpos == 1) {
- printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1);
- printf("M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1,
- "ACGTX"[cns[0]&3], "ACGTX"[cns[0]>>16&3], g->vpos_shift + 1);
- for (k = 0; k < kh_end(hash); ++k) {
- if (kh_exist(hash, k)) {
- frag_t *f = &kh_val(hash, k);
- if (f->vpos) continue;
- f->flip = 0;
- if (f->seq[0] == 0) f->phased = 0;
- else f->phased = 1, f->phase = f->seq[0] - 1;
- }
- }
- dump_aln(g, min_pos, hash);
- ++g->vpos_shift;
- return 1;
- }
- { // phase
- int **cnt;
- uint64_t *mask;
- printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1);
- sitemask = calloc(vpos, 1);
- cnt = count_all(g->k, vpos, hash);
- path = dynaprog(g->k, vpos, cnt);
- for (i = 0; i < vpos; ++i) free(cnt[i]);
- free(cnt);
- pcnt = fragphase(vpos, path, hash, 0); // do not fix chimeras when masking
- mask = genmask(vpos, pcnt, &n_masked);
- regmask = calloc(n_masked, 8);
- for (i = 0; i < n_masked; ++i) {
- regmask[i] = cns[mask[i]>>32]>>32<<32 | cns[(uint32_t)mask[i]]>>32;
- for (j = mask[i]>>32; j <= (int32_t)mask[i]; ++j)
- sitemask[j] = 1;
- }
- free(mask);
- if (g->flag & FLAG_FIX_CHIMERA) {
- free(pcnt);
- pcnt = fragphase(vpos, path, hash, 1);
- }
- }
- for (i = 0; i < n_masked; ++i)
- printf("FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1);
- for (i = 0; i < vpos; ++i) {
- uint64_t x = pcnt[i];
- int8_t c[2];
- c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3);
- c[1] = (cns[i]>>16&0xffff)>>2 == 0? 4 : (cns[i]>>16&3);
- printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]],
- i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff));
- }
- free(path); free(pcnt); free(regmask); free(sitemask);
- seqs = calloc(n_seqs, sizeof(void*));
- for (k = 0, i = 0; k < kh_end(hash); ++k)
- if (kh_exist(hash, k) && kh_val(hash, k).vpos < vpos && !kh_val(hash, k).single)
- seqs[i++] = &kh_val(hash, k);
- n_seqs = i;
- ks_introsort_rseq(n_seqs, seqs);
- for (i = 0; i < n_seqs; ++i) {
- frag_t *f = seqs[i];
- printf("EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen);
- for (j = 0; j < f->vlen; ++j) {
- uint32_t c = cns[f->vpos + j];
- if (f->seq[j] == 0) putchar('N');
- else putchar("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)]);
- }
- printf("\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1);
- }
- free(seqs);
- printf("//\n");
- fflush(stdout);
- g->vpos_shift += vpos;
- dump_aln(g, min_pos, hash);
- return vpos;
-}
-
-static void update_vpos(int vpos, nseq_t *hash)
-{
- khint_t k;
- for (k = 0; k < kh_end(hash); ++k) {
- if (kh_exist(hash, k)) {
- frag_t *f = &kh_val(hash, k);
- if (f->vpos < vpos) kh_del(64, hash, k); // TODO: if frag_t::seq is allocated dynamically, free it
- else f->vpos -= vpos;
- }
- }
-}
-
-static nseq_t *shrink_hash(nseq_t *hash) // TODO: to implement
-{
- return hash;
-}
-
-static int readaln(void *data, bam1_t *b)
-{
- phaseg_t *g = (phaseg_t*)data;
- int ret;
- ret = bam_read1(g->fp, b);
- if (ret < 0) return ret;
- if (!(b->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) && g->pre) {
- if (g->n == g->m) {
- g->m = g->m? g->m<<1 : 16;
- g->b = realloc(g->b, g->m * sizeof(void*));
- }
- g->b[g->n++] = bam_dup1(b);
- }
- return ret;
-}
-
-static khash_t(set64) *loadpos(const char *fn, bam_header_t *h)
-{
- gzFile fp;
- kstream_t *ks;
- int ret, dret;
- kstring_t *str;
- khash_t(set64) *hash;
-
- hash = kh_init(set64);
- str = calloc(1, sizeof(kstring_t));
- fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
- ks = ks_init(fp);
- while (ks_getuntil(ks, 0, str, &dret) >= 0) {
- int tid = bam_get_tid(h, str->s);
- if (tid >= 0 && dret != '\n') {
- if (ks_getuntil(ks, 0, str, &dret) >= 0) {
- uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1);
- kh_put(set64, hash, x, &ret);
- } else break;
- }
- if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
- if (dret < 0) break;
- }
- ks_destroy(ks);
- gzclose(fp);
- free(str->s); free(str);
- return hash;
-}
-
-static int gl2cns(float q[16])
-{
- int i, j, min_ij;
- float min, min2;
- min = min2 = 1e30; min_ij = -1;
- for (i = 0; i < 4; ++i) {
- for (j = i; j < 4; ++j) {
- if (q[i<<2|j] < min) min_ij = i<<2|j, min2 = min, min = q[i<<2|j];
- else if (q[i<<2|j] < min2) min2 = q[i<<2|j];
- }
- }
- return (min_ij>>2&3) == (min_ij&3)? 0 : 1<<18 | (min_ij>>2&3)<<16 | (min_ij&3) | (int)(min2 - min + .499) << 2;
-}
-
-int main_phase(int argc, char *argv[])
-{
- extern void bam_init_header_hash(bam_header_t *header);
- int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0;
- const bam_pileup1_t *plp;
- bam_plp_t iter;
- bam_header_t *h;
- nseq_t *seqs;
- uint64_t *cns = 0;
- phaseg_t g;
- char *fn_list = 0;
- khash_t(set64) *set = 0;
- errmod_t *em;
- uint16_t *bases;
-
- memset(&g, 0, sizeof(phaseg_t));
- g.flag = FLAG_FIX_CHIMERA;
- g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256;
- while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:A:")) >= 0) {
- switch (c) {
- case 'D': g.max_depth = atoi(optarg); break;
- case 'q': g.min_varLOD = atoi(optarg); break;
- case 'Q': g.min_baseQ = atoi(optarg); break;
- case 'k': g.k = atoi(optarg); break;
- case 'F': g.flag &= ~FLAG_FIX_CHIMERA; break;
- case 'e': g.flag |= FLAG_LIST_EXCL; break;
- case 'A': g.flag |= FLAG_DROP_AMBI; break;
- case 'b': g.pre = strdup(optarg); break;
- case 'l': fn_list = strdup(optarg); break;
- }
- }
- if (argc == optind) {
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools phase [options] <in.bam>\n\n");
- fprintf(stderr, "Options: -k INT block length [%d]\n", g.k);
- fprintf(stderr, " -b STR prefix of BAMs to output [null]\n");
- fprintf(stderr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD);
- fprintf(stderr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ);
- fprintf(stderr, " -D INT max read depth [%d]\n", g.max_depth);
-// fprintf(stderr, " -l FILE list of sites to phase [null]\n");
- fprintf(stderr, " -F do not attempt to fix chimeras\n");
- fprintf(stderr, " -A drop reads with ambiguous phase\n");
-// fprintf(stderr, " -e do not discover SNPs (effective with -l)\n");
- fprintf(stderr, "\n");
- return 1;
- }
- g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
- h = bam_header_read(g.fp);
- if (fn_list) { // read the list of sites to phase
- bam_init_header_hash(h);
- set = loadpos(fn_list, h);
- free(fn_list);
- } else g.flag &= ~FLAG_LIST_EXCL;
- if (g.pre) { // open BAMs to write
- char *s = malloc(strlen(g.pre) + 20);
- strcpy(s, g.pre); strcat(s, ".0.bam"); g.out[0] = bam_open(s, "w");
- strcpy(s, g.pre); strcat(s, ".1.bam"); g.out[1] = bam_open(s, "w");
- strcpy(s, g.pre); strcat(s, ".chimera.bam"); g.out[2] = bam_open(s, "w");
- for (c = 0; c <= 2; ++c) bam_header_write(g.out[c], h);
- free(s);
- }
-
- iter = bam_plp_init(readaln, &g);
- g.vpos_shift = 0;
- seqs = kh_init(64);
- em = errmod_init(1. - 0.83);
- bases = calloc(g.max_depth, 2);
- printf("CC\n");
- printf("CC\tDescriptions:\nCC\n");
- printf("CC\t CC comments\n");
- printf("CC\t PS start of a phase set\n");
- printf("CC\t FL filtered region\n");
- printf("CC\t M[012] markers; 0 for singletons, 1 for phased and 2 for filtered\n");
- printf("CC\t EV supporting reads; SAM format\n");
- printf("CC\t // end of a phase set\nCC\n");
- printf("CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n");
- printf("CC\t PS chr phaseSetStart phaseSetEnd\n");
- printf("CC\t FL chr filterStart filterEnd\n");
- printf("CC\t M? chr PS pos allele0 allele1 hetIndex #supports0 #errors0 #supp1 #err1\n");
- printf("CC\nCC\n");
- fflush(stdout);
- while ((plp = bam_plp_auto(iter, &tid, &pos, &n)) != 0) {
- int i, k, c, tmp, dophase = 1, in_set = 0;
- float q[16];
- if (tid < 0) break;
- if (tid != lasttid) { // change of chromosome
- g.vpos_shift = 0;
- if (lasttid >= 0) {
- seqs = shrink_hash(seqs);
- phase(&g, h->target_name[lasttid], vpos, cns, seqs);
- update_vpos(0x7fffffff, seqs);
- }
- lasttid = tid;
- vpos = 0;
- }
- if (set && kh_get(set64, set, (uint64_t)tid<<32 | pos) != kh_end(set)) in_set = 1;
- if (n > g.max_depth) continue; // do not proceed if the depth is too high
- // fill the bases array and check if there is a variant
- for (i = k = 0; i < n; ++i) {
- const bam_pileup1_t *p = plp + i;
- uint8_t *seq;
- int q, baseQ, b;
- if (p->is_del || p->is_refskip) continue;
- baseQ = bam1_qual(p->b)[p->qpos];
- if (baseQ < g.min_baseQ) continue;
- seq = bam1_seq(p->b);
- b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)];
- if (b > 3) continue;
- q = baseQ < p->b->core.qual? baseQ : p->b->core.qual;
- if (q < 4) q = 4;
- if (q > 63) q = 63;
- bases[k++] = q<<5 | (int)bam1_strand(p->b)<<4 | b;
- }
- if (k == 0) continue;
- errmod_cal(em, k, 4, bases, q); // compute genotype likelihood
- c = gl2cns(q); // get the consensus
- // tell if to proceed
- if (set && (g.flag&FLAG_LIST_EXCL) && !in_set) continue; // not in the list
- if (!in_set && (c&0xffff)>>2 < g.min_varLOD) continue; // not a variant
- // add the variant
- if (vpos == max_vpos) {
- max_vpos = max_vpos? max_vpos<<1 : 128;
- cns = realloc(cns, max_vpos * 8);
- }
- cns[vpos] = (uint64_t)pos<<32 | c;
- for (i = 0; i < n; ++i) {
- const bam_pileup1_t *p = plp + i;
- uint64_t key;
- khint_t k;
- uint8_t *seq = bam1_seq(p->b);
- frag_t *f;
- if (p->is_del || p->is_refskip) continue;
- if (p->b->core.qual == 0) continue;
- // get the base code
- c = nt16_nt4_table[(int)bam1_seqi(seq, p->qpos)];
- if (c == (cns[vpos]&3)) c = 1;
- else if (c == (cns[vpos]>>16&3)) c = 2;
- else c = 0;
- // write to seqs
- key = X31_hash_string(bam1_qname(p->b));
- k = kh_put(64, seqs, key, &tmp);
- f = &kh_val(seqs, k);
- if (tmp == 0) { // present in the hash table
- if (vpos - f->vpos + 1 < MAX_VARS) {
- f->vlen = vpos - f->vpos + 1;
- f->seq[f->vlen-1] = c;
- f->end = bam_calend(&p->b->core, bam1_cigar(p->b));
- }
- dophase = 0;
- } else { // absent
- memset(f->seq, 0, MAX_VARS);
- f->beg = p->b->core.pos;
- f->end = bam_calend(&p->b->core, bam1_cigar(p->b));
- f->vpos = vpos, f->vlen = 1, f->seq[0] = c, f->single = f->phased = f->flip = f->ambig = 0;
- }
- }
- if (dophase) {
- seqs = shrink_hash(seqs);
- phase(&g, h->target_name[tid], vpos, cns, seqs);
- update_vpos(vpos, seqs);
- cns[0] = cns[vpos];
- vpos = 0;
- }
- ++vpos;
- }
- if (tid >= 0) phase(&g, h->target_name[tid], vpos, cns, seqs);
- bam_header_destroy(h);
- bam_plp_destroy(iter);
- bam_close(g.fp);
- kh_destroy(64, seqs);
- kh_destroy(set64, set);
- free(cns);
- errmod_destroy(em);
- free(bases);
- if (g.pre) {
- for (c = 0; c <= 2; ++c) bam_close(g.out[c]);
- free(g.pre); free(g.b);
- }
- return 0;
-}
diff --git a/sam/razf.c b/sam/razf.c
deleted file mode 100644
index e7499f9..0000000
--- a/sam/razf.c
+++ /dev/null
@@ -1,853 +0,0 @@
-/*
- * RAZF : Random Access compressed(Z) File
- * Version: 1.0
- * Release Date: 2008-10-27
- *
- * Copyright 2008, Jue Ruan <***@gmail.com>, Heng Li <***@sanger.ac.uk>
- *
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _NO_RAZF
-
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include "razf.h"
-
-
-#if ZLIB_VERNUM < 0x1221
-struct _gz_header_s {
- int text;
- uLong time;
- int xflags;
- int os;
- Bytef *extra;
- uInt extra_len;
- uInt extra_max;
- Bytef *name;
- uInt name_max;
- Bytef *comment;
- uInt comm_max;
- int hcrc;
- int done;
-};
-#warning "zlib < 1.2.2.1; RAZF writing is disabled."
-#endif
-
-#define DEF_MEM_LEVEL 8
-
-static inline uint32_t byte_swap_4(uint32_t v){
- v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
- return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
-}
-
-static inline uint64_t byte_swap_8(uint64_t v){
- v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
- v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
- return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
-}
-
-static inline int is_big_endian(){
- int x = 0x01;
- char *c = (char*)&x;
- return (c[0] != 0x01);
-}
-
-#ifndef _RZ_READONLY
-static void add_zindex(RAZF *rz, int64_t in, int64_t out){
- if(rz->index->size == rz->index->cap){
- rz->index->cap = rz->index->cap * 1.5 + 2;
- rz->index->cell_offsets = realloc(rz->index->cell_offsets, sizeof(int) * rz->index->cap);
- rz->index->bin_offsets = realloc(rz->index->bin_offsets, sizeof(int64_t) * (rz->index->cap/RZ_BIN_SIZE + 1));
- }
- if(rz->index->size % RZ_BIN_SIZE == 0) rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE] = out;
- rz->index->cell_offsets[rz->index->size] = out - rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE];
- rz->index->size ++;
-}
-
-static void save_zindex(RAZF *rz, int fd){
- int32_t i, v32;
- int is_be;
- is_be = is_big_endian();
- if(is_be) write(fd, &rz->index->size, sizeof(int));
- else {
- v32 = byte_swap_4((uint32_t)rz->index->size);
- write(fd, &v32, sizeof(uint32_t));
- }
- v32 = rz->index->size / RZ_BIN_SIZE + 1;
- if(!is_be){
- for(i=0;i<v32;i++) rz->index->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]);
- for(i=0;i<rz->index->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]);
- }
- write(fd, rz->index->bin_offsets, sizeof(int64_t) * v32);
- write(fd, rz->index->cell_offsets, sizeof(int32_t) * rz->index->size);
-}
-#endif
-
-#ifdef _USE_KNETFILE
-static void load_zindex(RAZF *rz, knetFile *fp){
-#else
-static void load_zindex(RAZF *rz, int fd){
-#endif
- int32_t i, v32;
- int is_be;
- if(!rz->load_index) return;
- if(rz->index == NULL) rz->index = malloc(sizeof(ZBlockIndex));
- is_be = is_big_endian();
-#ifdef _USE_KNETFILE
- knet_read(fp, &rz->index->size, sizeof(int));
-#else
- read(fd, &rz->index->size, sizeof(int));
-#endif
- if(!is_be) rz->index->size = byte_swap_4((uint32_t)rz->index->size);
- rz->index->cap = rz->index->size;
- v32 = rz->index->size / RZ_BIN_SIZE + 1;
- rz->index->bin_offsets = malloc(sizeof(int64_t) * v32);
-#ifdef _USE_KNETFILE
- knet_read(fp, rz->index->bin_offsets, sizeof(int64_t) * v32);
-#else
- read(fd, rz->index->bin_offsets, sizeof(int64_t) * v32);
-#endif
- rz->index->cell_offsets = malloc(sizeof(int) * rz->index->size);
-#ifdef _USE_KNETFILE
- knet_read(fp, rz->index->cell_offsets, sizeof(int) * rz->index->size);
-#else
- read(fd, rz->index->cell_offsets, sizeof(int) * rz->index->size);
-#endif
- if(!is_be){
- for(i=0;i<v32;i++) rz->index->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]);
- for(i=0;i<rz->index->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]);
- }
-}
-
-#ifdef _RZ_READONLY
-static RAZF* razf_open_w(int fd)
-{
- fprintf(stderr, "[razf_open_w] Writing is not available with zlib ver < 1.2.2.1\n");
- return 0;
-}
-#else
-static RAZF* razf_open_w(int fd){
- RAZF *rz;
-#ifdef _WIN32
- setmode(fd, O_BINARY);
-#endif
- rz = calloc(1, sizeof(RAZF));
- rz->mode = 'w';
-#ifdef _USE_KNETFILE
- rz->x.fpw = fd;
-#else
- rz->filedes = fd;
-#endif
- rz->stream = calloc(sizeof(z_stream), 1);
- rz->inbuf = malloc(RZ_BUFFER_SIZE);
- rz->outbuf = malloc(RZ_BUFFER_SIZE);
- rz->index = calloc(sizeof(ZBlockIndex), 1);
- deflateInit2(rz->stream, RZ_COMPRESS_LEVEL, Z_DEFLATED, WINDOW_BITS + 16, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY);
- rz->stream->avail_out = RZ_BUFFER_SIZE;
- rz->stream->next_out = rz->outbuf;
- rz->header = calloc(sizeof(gz_header), 1);
- rz->header->os = 0x03; //Unix
- rz->header->text = 0;
- rz->header->time = 0;
- rz->header->extra = malloc(7);
- strncpy((char*)rz->header->extra, "RAZF", 4);
- rz->header->extra[4] = 1; // obsolete field
- // block size = RZ_BLOCK_SIZE, Big-Endian
- rz->header->extra[5] = RZ_BLOCK_SIZE >> 8;
- rz->header->extra[6] = RZ_BLOCK_SIZE & 0xFF;
- rz->header->extra_len = 7;
- rz->header->name = rz->header->comment = 0;
- rz->header->hcrc = 0;
- deflateSetHeader(rz->stream, rz->header);
- rz->block_pos = rz->block_off = 0;
- return rz;
-}
-
-static void _razf_write(RAZF* rz, const void *data, int size){
- int tout;
- rz->stream->avail_in = size;
- rz->stream->next_in = (void*)data;
- while(1){
- tout = rz->stream->avail_out;
- deflate(rz->stream, Z_NO_FLUSH);
- rz->out += tout - rz->stream->avail_out;
- if(rz->stream->avail_out) break;
-#ifdef _USE_KNETFILE
- write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
-#else
- write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
-#endif
- rz->stream->avail_out = RZ_BUFFER_SIZE;
- rz->stream->next_out = rz->outbuf;
- if(rz->stream->avail_in == 0) break;
- };
- rz->in += size - rz->stream->avail_in;
- rz->block_off += size - rz->stream->avail_in;
-}
-
-static void razf_flush(RAZF *rz){
- uint32_t tout;
- if(rz->buf_len){
- _razf_write(rz, rz->inbuf, rz->buf_len);
- rz->buf_off = rz->buf_len = 0;
- }
- if(rz->stream->avail_out){
-#ifdef _USE_KNETFILE
- write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
-#else
- write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
-#endif
- rz->stream->avail_out = RZ_BUFFER_SIZE;
- rz->stream->next_out = rz->outbuf;
- }
- while(1){
- tout = rz->stream->avail_out;
- deflate(rz->stream, Z_FULL_FLUSH);
- rz->out += tout - rz->stream->avail_out;
- if(rz->stream->avail_out == 0){
-#ifdef _USE_KNETFILE
- write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
-#else
- write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
-#endif
- rz->stream->avail_out = RZ_BUFFER_SIZE;
- rz->stream->next_out = rz->outbuf;
- } else break;
- }
- rz->block_pos = rz->out;
- rz->block_off = 0;
-}
-
-static void razf_end_flush(RAZF *rz){
- uint32_t tout;
- if(rz->buf_len){
- _razf_write(rz, rz->inbuf, rz->buf_len);
- rz->buf_off = rz->buf_len = 0;
- }
- while(1){
- tout = rz->stream->avail_out;
- deflate(rz->stream, Z_FINISH);
- rz->out += tout - rz->stream->avail_out;
- if(rz->stream->avail_out < RZ_BUFFER_SIZE){
-#ifdef _USE_KNETFILE
- write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
-#else
- write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
-#endif
- rz->stream->avail_out = RZ_BUFFER_SIZE;
- rz->stream->next_out = rz->outbuf;
- } else break;
- }
-}
-
-static void _razf_buffered_write(RAZF *rz, const void *data, int size){
- int i, n;
- while(1){
- if(rz->buf_len == RZ_BUFFER_SIZE){
- _razf_write(rz, rz->inbuf, rz->buf_len);
- rz->buf_len = 0;
- }
- if(size + rz->buf_len < RZ_BUFFER_SIZE){
- for(i=0;i<size;i++) ((char*)rz->inbuf + rz->buf_len)[i] = ((char*)data)[i];
- rz->buf_len += size;
- return;
- } else {
- n = RZ_BUFFER_SIZE - rz->buf_len;
- for(i=0;i<n;i++) ((char*)rz->inbuf + rz->buf_len)[i] = ((char*)data)[i];
- size -= n;
- data += n;
- rz->buf_len += n;
- }
- }
-}
-
-int razf_write(RAZF* rz, const void *data, int size){
- int ori_size, n;
- int64_t next_block;
- ori_size = size;
- next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE;
- while(rz->in + rz->buf_len + size >= next_block){
- n = next_block - rz->in - rz->buf_len;
- _razf_buffered_write(rz, data, n);
- data += n;
- size -= n;
- razf_flush(rz);
- add_zindex(rz, rz->in, rz->out);
- next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE;
- }
- _razf_buffered_write(rz, data, size);
- return ori_size;
-}
-#endif
-
-/* gzip flag byte */
-#define ASCII_FLAG 0x01 /* bit 0 set: file probably ascii text */
-#define HEAD_CRC 0x02 /* bit 1 set: header CRC present */
-#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
-#define ORIG_NAME 0x08 /* bit 3 set: original file name present */
-#define COMMENT 0x10 /* bit 4 set: file comment present */
-#define RESERVED 0xE0 /* bits 5..7: reserved */
-
-static int _read_gz_header(unsigned char *data, int size, int *extra_off, int *extra_len){
- int method, flags, n, len;
- if(size < 2) return 0;
- if(data[0] != 0x1f || data[1] != 0x8b) return 0;
- if(size < 4) return 0;
- method = data[2];
- flags = data[3];
- if(method != Z_DEFLATED || (flags & RESERVED)) return 0;
- n = 4 + 6; // Skip 6 bytes
- *extra_off = n + 2;
- *extra_len = 0;
- if(flags & EXTRA_FIELD){
- if(size < n + 2) return 0;
- len = ((int)data[n + 1] << 8) | data[n];
- n += 2;
- *extra_off = n;
- while(len){
- if(n >= size) return 0;
- n ++;
- len --;
- }
- *extra_len = n - (*extra_off);
- }
- if(flags & ORIG_NAME) while(n < size && data[n++]);
- if(flags & COMMENT) while(n < size && data[n++]);
- if(flags & HEAD_CRC){
- if(n + 2 > size) return 0;
- n += 2;
- }
- return n;
-}
-
-#ifdef _USE_KNETFILE
-static RAZF* razf_open_r(knetFile *fp, int _load_index){
-#else
-static RAZF* razf_open_r(int fd, int _load_index){
-#endif
- RAZF *rz;
- int ext_off, ext_len;
- int n, is_be, ret;
- int64_t end;
- unsigned char c[] = "RAZF";
- rz = calloc(1, sizeof(RAZF));
- rz->mode = 'r';
-#ifdef _USE_KNETFILE
- rz->x.fpr = fp;
-#else
-#ifdef _WIN32
- setmode(fd, O_BINARY);
-#endif
- rz->filedes = fd;
-#endif
- rz->stream = calloc(sizeof(z_stream), 1);
- rz->inbuf = malloc(RZ_BUFFER_SIZE);
- rz->outbuf = malloc(RZ_BUFFER_SIZE);
- rz->end = rz->src_end = 0x7FFFFFFFFFFFFFFFLL;
-#ifdef _USE_KNETFILE
- n = knet_read(rz->x.fpr, rz->inbuf, RZ_BUFFER_SIZE);
-#else
- n = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE);
-#endif
- ret = _read_gz_header(rz->inbuf, n, &ext_off, &ext_len);
- if(ret == 0){
- PLAIN_FILE:
- rz->in = n;
- rz->file_type = FILE_TYPE_PLAIN;
- memcpy(rz->outbuf, rz->inbuf, n);
- rz->buf_len = n;
- free(rz->stream);
- rz->stream = NULL;
- return rz;
- }
- rz->header_size = ret;
- ret = inflateInit2(rz->stream, -WINDOW_BITS);
- if(ret != Z_OK){ inflateEnd(rz->stream); goto PLAIN_FILE;}
- rz->stream->avail_in = n - rz->header_size;
- rz->stream->next_in = rz->inbuf + rz->header_size;
- rz->stream->avail_out = RZ_BUFFER_SIZE;
- rz->stream->next_out = rz->outbuf;
- rz->file_type = FILE_TYPE_GZ;
- rz->in = rz->header_size;
- rz->block_pos = rz->header_size;
- rz->next_block_pos = rz->header_size;
- rz->block_off = 0;
- if(ext_len < 7 || memcmp(rz->inbuf + ext_off, c, 4) != 0) return rz;
- if(((((unsigned char*)rz->inbuf)[ext_off + 5] << 8) | ((unsigned char*)rz->inbuf)[ext_off + 6]) != RZ_BLOCK_SIZE){
- fprintf(stderr, " -- WARNING: RZ_BLOCK_SIZE is not %d, treat source as gz file. in %s -- %s:%d --\n", RZ_BLOCK_SIZE, __FUNCTION__, __FILE__, __LINE__);
- return rz;
- }
- rz->load_index = _load_index;
- rz->file_type = FILE_TYPE_RZ;
-#ifdef _USE_KNETFILE
- if(knet_seek(fp, -16, SEEK_END) == -1){
-#else
- if(lseek(fd, -16, SEEK_END) == -1){
-#endif
- UNSEEKABLE:
- rz->seekable = 0;
- rz->index = NULL;
- rz->src_end = rz->end = 0x7FFFFFFFFFFFFFFFLL;
- } else {
- is_be = is_big_endian();
- rz->seekable = 1;
-#ifdef _USE_KNETFILE
- knet_read(fp, &end, sizeof(int64_t));
-#else
- read(fd, &end, sizeof(int64_t));
-#endif
- if(!is_be) rz->src_end = (int64_t)byte_swap_8((uint64_t)end);
- else rz->src_end = end;
-
-#ifdef _USE_KNETFILE
- knet_read(fp, &end, sizeof(int64_t));
-#else
- read(fd, &end, sizeof(int64_t));
-#endif
- if(!is_be) rz->end = (int64_t)byte_swap_8((uint64_t)end);
- else rz->end = end;
- if(n > rz->end){
- rz->stream->avail_in -= n - rz->end;
- n = rz->end;
- }
- if(rz->end > rz->src_end){
-#ifdef _USE_KNETFILE
- knet_seek(fp, rz->in, SEEK_SET);
-#else
- lseek(fd, rz->in, SEEK_SET);
-#endif
- goto UNSEEKABLE;
- }
-#ifdef _USE_KNETFILE
- knet_seek(fp, rz->end, SEEK_SET);
- if(knet_tell(fp) != rz->end){
- knet_seek(fp, rz->in, SEEK_SET);
-#else
- if(lseek(fd, rz->end, SEEK_SET) != rz->end){
- lseek(fd, rz->in, SEEK_SET);
-#endif
- goto UNSEEKABLE;
- }
-#ifdef _USE_KNETFILE
- load_zindex(rz, fp);
- knet_seek(fp, n, SEEK_SET);
-#else
- load_zindex(rz, fd);
- lseek(fd, n, SEEK_SET);
-#endif
- }
- return rz;
-}
-
-#ifdef _USE_KNETFILE
-RAZF* razf_dopen(int fd, const char *mode){
- if (strstr(mode, "r")) fprintf(stderr,"[razf_dopen] implement me\n");
- else if(strstr(mode, "w")) return razf_open_w(fd);
- return NULL;
-}
-
-RAZF* razf_dopen2(int fd, const char *mode)
-{
- fprintf(stderr,"[razf_dopen2] implement me\n");
- return NULL;
-}
-#else
-RAZF* razf_dopen(int fd, const char *mode){
- if(strstr(mode, "r")) return razf_open_r(fd, 1);
- else if(strstr(mode, "w")) return razf_open_w(fd);
- else return NULL;
-}
-
-RAZF* razf_dopen2(int fd, const char *mode)
-{
- if(strstr(mode, "r")) return razf_open_r(fd, 0);
- else if(strstr(mode, "w")) return razf_open_w(fd);
- else return NULL;
-}
-#endif
-
-static inline RAZF* _razf_open(const char *filename, const char *mode, int _load_index){
- int fd;
- RAZF *rz;
- if(strstr(mode, "r")){
-#ifdef _USE_KNETFILE
- knetFile *fd = knet_open(filename, "r");
- if (fd == 0) {
- fprintf(stderr, "[_razf_open] fail to open %s\n", filename);
- return NULL;
- }
-#else
-#ifdef _WIN32
- fd = open(filename, O_RDONLY | O_BINARY);
-#else
- fd = open(filename, O_RDONLY);
-#endif
-#endif
- if(fd < 0) return NULL;
- rz = razf_open_r(fd, _load_index);
- } else if(strstr(mode, "w")){
-#ifdef _WIN32
- fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0666);
-#else
- fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0666);
-#endif
- if(fd < 0) return NULL;
- rz = razf_open_w(fd);
- } else return NULL;
- return rz;
-}
-
-RAZF* razf_open(const char *filename, const char *mode){
- return _razf_open(filename, mode, 1);
-}
-
-RAZF* razf_open2(const char *filename, const char *mode){
- return _razf_open(filename, mode, 0);
-}
-
-int razf_get_data_size(RAZF *rz, int64_t *u_size, int64_t *c_size){
- int64_t n;
- if(rz->mode != 'r' && rz->mode != 'R') return 0;
- switch(rz->file_type){
- case FILE_TYPE_PLAIN:
- if(rz->end == 0x7fffffffffffffffLL){
-#ifdef _USE_KNETFILE
- if(knet_seek(rz->x.fpr, 0, SEEK_CUR) == -1) return 0;
- n = knet_tell(rz->x.fpr);
- knet_seek(rz->x.fpr, 0, SEEK_END);
- rz->end = knet_tell(rz->x.fpr);
- knet_seek(rz->x.fpr, n, SEEK_SET);
-#else
- if((n = lseek(rz->filedes, 0, SEEK_CUR)) == -1) return 0;
- rz->end = lseek(rz->filedes, 0, SEEK_END);
- lseek(rz->filedes, n, SEEK_SET);
-#endif
- }
- *u_size = *c_size = rz->end;
- return 1;
- case FILE_TYPE_GZ:
- return 0;
- case FILE_TYPE_RZ:
- if(rz->src_end == rz->end) return 0;
- *u_size = rz->src_end;
- *c_size = rz->end;
- return 1;
- default:
- return 0;
- }
-}
-
-static int _razf_read(RAZF* rz, void *data, int size){
- int ret, tin;
- if(rz->z_eof || rz->z_err) return 0;
- if (rz->file_type == FILE_TYPE_PLAIN) {
-#ifdef _USE_KNETFILE
- ret = knet_read(rz->x.fpr, data, size);
-#else
- ret = read(rz->filedes, data, size);
-#endif
- if (ret == 0) rz->z_eof = 1;
- return ret;
- }
- rz->stream->avail_out = size;
- rz->stream->next_out = data;
- while(rz->stream->avail_out){
- if(rz->stream->avail_in == 0){
- if(rz->in >= rz->end){ rz->z_eof = 1; break; }
- if(rz->end - rz->in < RZ_BUFFER_SIZE){
-#ifdef _USE_KNETFILE
- rz->stream->avail_in = knet_read(rz->x.fpr, rz->inbuf, rz->end -rz->in);
-#else
- rz->stream->avail_in = read(rz->filedes, rz->inbuf, rz->end -rz->in);
-#endif
- } else {
-#ifdef _USE_KNETFILE
- rz->stream->avail_in = knet_read(rz->x.fpr, rz->inbuf, RZ_BUFFER_SIZE);
-#else
- rz->stream->avail_in = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE);
-#endif
- }
- if(rz->stream->avail_in == 0){
- rz->z_eof = 1;
- break;
- }
- rz->stream->next_in = rz->inbuf;
- }
- tin = rz->stream->avail_in;
- ret = inflate(rz->stream, Z_BLOCK);
- rz->in += tin - rz->stream->avail_in;
- if(ret == Z_NEED_DICT || ret == Z_MEM_ERROR || ret == Z_DATA_ERROR){
- fprintf(stderr, "[_razf_read] inflate error: %d %s (at %s:%d)\n", ret, rz->stream->msg ? rz->stream->msg : "", __FILE__, __LINE__);
- rz->z_err = 1;
- break;
- }
- if(ret == Z_STREAM_END){
- rz->z_eof = 1;
- break;
- }
- if ((rz->stream->data_type&128) && !(rz->stream->data_type&64)){
- rz->buf_flush = 1;
- rz->next_block_pos = rz->in;
- break;
- }
- }
- return size - rz->stream->avail_out;
-}
-
-int razf_read(RAZF *rz, void *data, int size){
- int ori_size, i;
- ori_size = size;
- while(size > 0){
- if(rz->buf_len){
- if(size < rz->buf_len){
- for(i=0;i<size;i++) ((char*)data)[i] = ((char*)rz->outbuf + rz->buf_off)[i];
- rz->buf_off += size;
- rz->buf_len -= size;
- data += size;
- rz->block_off += size;
- size = 0;
- break;
- } else {
- for(i=0;i<rz->buf_len;i++) ((char*)data)[i] = ((char*)rz->outbuf + rz->buf_off)[i];
- data += rz->buf_len;
- size -= rz->buf_len;
- rz->block_off += rz->buf_len;
- rz->buf_off = 0;
- rz->buf_len = 0;
- if(rz->buf_flush){
- rz->block_pos = rz->next_block_pos;
- rz->block_off = 0;
- rz->buf_flush = 0;
- }
- }
- } else if(rz->buf_flush){
- rz->block_pos = rz->next_block_pos;
- rz->block_off = 0;
- rz->buf_flush = 0;
- }
- if(rz->buf_flush) continue;
- rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE);
- if(rz->z_eof && rz->buf_len == 0) break;
- }
- rz->out += ori_size - size;
- return ori_size - size;
-}
-
-int razf_skip(RAZF* rz, int size){
- int ori_size;
- ori_size = size;
- while(size > 0){
- if(rz->buf_len){
- if(size < rz->buf_len){
- rz->buf_off += size;
- rz->buf_len -= size;
- rz->block_off += size;
- size = 0;
- break;
- } else {
- size -= rz->buf_len;
- rz->buf_off = 0;
- rz->buf_len = 0;
- rz->block_off += rz->buf_len;
- if(rz->buf_flush){
- rz->block_pos = rz->next_block_pos;
- rz->block_off = 0;
- rz->buf_flush = 0;
- }
- }
- } else if(rz->buf_flush){
- rz->block_pos = rz->next_block_pos;
- rz->block_off = 0;
- rz->buf_flush = 0;
- }
- if(rz->buf_flush) continue;
- rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE);
- if(rz->z_eof || rz->z_err) break;
- }
- rz->out += ori_size - size;
- return ori_size - size;
-}
-
-static void _razf_reset_read(RAZF *rz, int64_t in, int64_t out){
-#ifdef _USE_KNETFILE
- knet_seek(rz->x.fpr, in, SEEK_SET);
-#else
- lseek(rz->filedes, in, SEEK_SET);
-#endif
- rz->in = in;
- rz->out = out;
- rz->block_pos = in;
- rz->next_block_pos = in;
- rz->block_off = 0;
- rz->buf_flush = 0;
- rz->z_eof = rz->z_err = 0;
- inflateReset(rz->stream);
- rz->stream->avail_in = 0;
- rz->buf_off = rz->buf_len = 0;
-}
-
-int64_t razf_jump(RAZF *rz, int64_t block_start, int block_offset){
- int64_t pos;
- rz->z_eof = 0;
- if(rz->file_type == FILE_TYPE_PLAIN){
- rz->buf_off = rz->buf_len = 0;
- pos = block_start + block_offset;
-#ifdef _USE_KNETFILE
- knet_seek(rz->x.fpr, pos, SEEK_SET);
- pos = knet_tell(rz->x.fpr);
-#else
- pos = lseek(rz->filedes, pos, SEEK_SET);
-#endif
- rz->out = rz->in = pos;
- return pos;
- }
- if(block_start == rz->block_pos && block_offset >= rz->block_off) {
- block_offset -= rz->block_off;
- goto SKIP; // Needn't reset inflate
- }
- if(block_start == 0) block_start = rz->header_size; // Automaticly revist wrong block_start
- _razf_reset_read(rz, block_start, 0);
- SKIP:
- if(block_offset) razf_skip(rz, block_offset);
- return rz->block_off;
-}
-
-int64_t razf_seek(RAZF* rz, int64_t pos, int where){
- int64_t idx;
- int64_t seek_pos, new_out;
- rz->z_eof = 0;
- if (where == SEEK_CUR) pos += rz->out;
- else if (where == SEEK_END) pos += rz->src_end;
- if(rz->file_type == FILE_TYPE_PLAIN){
-#ifdef _USE_KNETFILE
- knet_seek(rz->x.fpr, pos, SEEK_SET);
- seek_pos = knet_tell(rz->x.fpr);
-#else
- seek_pos = lseek(rz->filedes, pos, SEEK_SET);
-#endif
- rz->buf_off = rz->buf_len = 0;
- rz->out = rz->in = seek_pos;
- return seek_pos;
- } else if(rz->file_type == FILE_TYPE_GZ){
- if(pos >= rz->out) goto SKIP;
- return rz->out;
- }
- if(pos == rz->out) return pos;
- if(pos > rz->src_end) return rz->out;
- if(!rz->seekable || !rz->load_index){
- if(pos >= rz->out) goto SKIP;
- }
- idx = pos / RZ_BLOCK_SIZE - 1;
- seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]);
- new_out = (idx + 1) * RZ_BLOCK_SIZE;
- if(pos > rz->out && new_out <= rz->out) goto SKIP;
- _razf_reset_read(rz, seek_pos, new_out);
- SKIP:
- razf_skip(rz, (int)(pos - rz->out));
- return rz->out;
-}
-
-uint64_t razf_tell2(RAZF *rz)
-{
- /*
- if (rz->load_index) {
- int64_t idx, seek_pos;
- idx = rz->out / RZ_BLOCK_SIZE - 1;
- seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]);
- if (seek_pos != rz->block_pos || rz->out%RZ_BLOCK_SIZE != rz->block_off)
- fprintf(stderr, "[razf_tell2] inconsistent block offset: (%lld, %lld) != (%lld, %lld)\n",
- (long long)seek_pos, (long long)rz->out%RZ_BLOCK_SIZE, (long long)rz->block_pos, (long long) rz->block_off);
- }
- */
- return (uint64_t)rz->block_pos<<16 | (rz->block_off&0xffff);
-}
-
-int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where)
-{
- if (where != SEEK_SET) return -1;
- return razf_jump(rz, voffset>>16, voffset&0xffff);
-}
-
-void razf_close(RAZF *rz){
- if(rz->mode == 'w'){
-#ifndef _RZ_READONLY
- razf_end_flush(rz);
- deflateEnd(rz->stream);
-#ifdef _USE_KNETFILE
- save_zindex(rz, rz->x.fpw);
- if(is_big_endian()){
- write(rz->x.fpw, &rz->in, sizeof(int64_t));
- write(rz->x.fpw, &rz->out, sizeof(int64_t));
- } else {
- uint64_t v64 = byte_swap_8((uint64_t)rz->in);
- write(rz->x.fpw, &v64, sizeof(int64_t));
- v64 = byte_swap_8((uint64_t)rz->out);
- write(rz->x.fpw, &v64, sizeof(int64_t));
- }
-#else
- save_zindex(rz, rz->filedes);
- if(is_big_endian()){
- write(rz->filedes, &rz->in, sizeof(int64_t));
- write(rz->filedes, &rz->out, sizeof(int64_t));
- } else {
- uint64_t v64 = byte_swap_8((uint64_t)rz->in);
- write(rz->filedes, &v64, sizeof(int64_t));
- v64 = byte_swap_8((uint64_t)rz->out);
- write(rz->filedes, &v64, sizeof(int64_t));
- }
-#endif
-#endif
- } else if(rz->mode == 'r'){
- if(rz->stream) inflateEnd(rz->stream);
- }
- if(rz->inbuf) free(rz->inbuf);
- if(rz->outbuf) free(rz->outbuf);
- if(rz->header){
- free(rz->header->extra);
- free(rz->header->name);
- free(rz->header->comment);
- free(rz->header);
- }
- if(rz->index){
- free(rz->index->bin_offsets);
- free(rz->index->cell_offsets);
- free(rz->index);
- }
- free(rz->stream);
-#ifdef _USE_KNETFILE
- if (rz->mode == 'r')
- knet_close(rz->x.fpr);
- if (rz->mode == 'w')
- close(rz->x.fpw);
-#else
- close(rz->filedes);
-#endif
- free(rz);
-}
-
-#endif
diff --git a/sam/razf.h b/sam/razf.h
deleted file mode 100644
index 60a0c96..0000000
--- a/sam/razf.h
+++ /dev/null
@@ -1,134 +0,0 @@
- /*-
- * RAZF : Random Access compressed(Z) File
- * Version: 1.0
- * Release Date: 2008-10-27
- *
- * Copyright 2008, Jue Ruan <***@gmail.com>, Heng Li <***@sanger.ac.uk>
- *
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-
-#ifndef __RAZF_RJ_H
-#define __RAZF_RJ_H
-
-#include <stdint.h>
-#include <stdio.h>
-#include "zlib.h"
-
-#ifdef _USE_KNETFILE
-#include "knetfile.h"
-#endif
-
-#if ZLIB_VERNUM < 0x1221
-#define _RZ_READONLY
-struct _gz_header_s;
-typedef struct _gz_header_s _gz_header;
-#define gz_header _gz_header
-#endif
-
-#define WINDOW_BITS 15
-
-#ifndef RZ_BLOCK_SIZE
-#define RZ_BLOCK_SIZE (1<<WINDOW_BITS)
-#endif
-
-#ifndef RZ_BUFFER_SIZE
-#define RZ_BUFFER_SIZE 4096
-#endif
-
-#ifndef RZ_COMPRESS_LEVEL
-#define RZ_COMPRESS_LEVEL 6
-#endif
-
-#define RZ_BIN_SIZE ((1LLU << 32) / RZ_BLOCK_SIZE)
-
-typedef struct {
- uint32_t *cell_offsets; // i
- int64_t *bin_offsets; // i / BIN_SIZE
- int size;
- int cap;
-} ZBlockIndex;
-/* When storing index, output bytes in Big-Endian everywhere */
-
-#define FILE_TYPE_RZ 1
-#define FILE_TYPE_PLAIN 2
-#define FILE_TYPE_GZ 3
-
-typedef struct RandomAccessZFile {
- char mode; /* 'w' : write mode; 'r' : read mode */
- int file_type;
- /* plain file or rz file, razf_read support plain file as input too, in this case, razf_read work as buffered fread */
-#ifdef _USE_KNETFILE
- union {
- knetFile *fpr;
- int fpw;
- } x;
-#else
- int filedes; /* the file descriptor */
-#endif
- z_stream *stream;
- ZBlockIndex *index;
- int64_t in, out, end, src_end;
- /* in: n bytes total in; out: n bytes total out; */
- /* end: the end of all data blocks, while the start of index; src_end: the true end position in uncompressed file */
- int buf_flush; // buffer should be flush, suspend inflate util buffer is empty
- int64_t block_pos, block_off, next_block_pos;
- /* block_pos: the start postiion of current block in compressed file */
- /* block_off: tell how many bytes have been read from current block */
- void *inbuf, *outbuf;
- int header_size;
- gz_header *header;
- /* header is used to transfer inflate_state->mode from HEAD to TYPE after call inflateReset */
- int buf_off, buf_len;
- int z_err, z_eof;
- int seekable;
- /* Indice where the source is seekable */
- int load_index;
- /* set has_index to 0 in mode 'w', then index will be discarded */
-} RAZF;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- RAZF* razf_dopen(int data_fd, const char *mode);
- RAZF *razf_open(const char *fn, const char *mode);
- int razf_write(RAZF* rz, const void *data, int size);
- int razf_read(RAZF* rz, void *data, int size);
- int64_t razf_seek(RAZF* rz, int64_t pos, int where);
- void razf_close(RAZF* rz);
-
-#define razf_tell(rz) ((rz)->out)
-
- RAZF* razf_open2(const char *filename, const char *mode);
- RAZF* razf_dopen2(int fd, const char *mode);
- uint64_t razf_tell2(RAZF *rz);
- int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/sam/razip.c b/sam/razip.c
deleted file mode 100644
index 825e732..0000000
--- a/sam/razip.c
+++ /dev/null
@@ -1,141 +0,0 @@
-#include <stdio.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <errno.h>
-#include <stdlib.h>
-#include <string.h>
-#include "razf.h"
-
-#define WINDOW_SIZE 4096
-
-static int razf_main_usage()
-{
- printf("\n");
- printf("Usage: razip [options] [file] ...\n\n");
- printf("Options: -c write on standard output, keep original files unchanged\n");
- printf(" -d decompress\n");
- printf(" -l list compressed file contents\n");
- printf(" -b INT decompress at INT position in the uncompressed file\n");
- printf(" -s INT decompress INT bytes in the uncompressed file\n");
- printf(" -h give this help\n");
- printf("\n");
- return 0;
-}
-
-static int write_open(const char *fn, int is_forced)
-{
- int fd = -1;
- char c;
- if (!is_forced) {
- if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) {
- printf("razip: %s already exists; do you wish to overwrite (y or n)? ", fn);
- scanf("%c", &c);
- if (c != 'Y' && c != 'y') {
- printf("razip: not overwritten\n");
- exit(1);
- }
- }
- }
- if (fd < 0) {
- if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) {
- fprintf(stderr, "razip: %s: Fail to write\n", fn);
- exit(1);
- }
- }
- return fd;
-}
-
-int main(int argc, char **argv)
-{
- int c, compress, pstdout, is_forced;
- RAZF *rz;
- void *buffer;
- long start, end, size;
-
- compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
- while((c = getopt(argc, argv, "cdlhfb:s:")) >= 0){
- switch(c){
- case 'h': return razf_main_usage();
- case 'd': compress = 0; break;
- case 'c': pstdout = 1; break;
- case 'l': compress = 2; break;
- case 'b': start = atol(optarg); break;
- case 's': size = atol(optarg); break;
- case 'f': is_forced = 1; break;
- }
- }
- if (size >= 0) end = start + size;
- if(end >= 0 && end < start){
- fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end);
- return 1;
- }
- if(compress == 1){
- int f_src, f_dst = -1;
- if(argc > optind){
- if((f_src = open(argv[optind], O_RDONLY)) < 0){
- fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
- return 1;
- }
- if(pstdout){
- f_dst = fileno(stdout);
- } else {
- char *name = malloc(sizeof(strlen(argv[optind]) + 5));
- strcpy(name, argv[optind]);
- strcat(name, ".rz");
- f_dst = write_open(name, is_forced);
- if (f_dst < 0) return 1;
- free(name);
- }
- } else if(pstdout){
- f_src = fileno(stdin);
- f_dst = fileno(stdout);
- } else return razf_main_usage();
- rz = razf_dopen(f_dst, "w");
- buffer = malloc(WINDOW_SIZE);
- while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) razf_write(rz, buffer, c);
- razf_close(rz); // f_dst will be closed here
- if (argc > optind && !pstdout) unlink(argv[optind]);
- free(buffer);
- close(f_src);
- return 0;
- } else {
- if(argc <= optind) return razf_main_usage();
- if(compress == 2){
- rz = razf_open(argv[optind], "r");
- if(rz->file_type == FILE_TYPE_RZ) {
- printf("%20s%20s%7s %s\n", "compressed", "uncompressed", "ratio", "name");
- printf("%20lld%20lld%6.1f%% %s\n", (long long)rz->end, (long long)rz->src_end, rz->end * 100.0f / rz->src_end,
- argv[optind]);
- } else fprintf(stdout, "%s is not a regular rz file\n", argv[optind]);
- } else {
- int f_dst;
- if (argc > optind && !pstdout) {
- char *name;
- if (strstr(argv[optind], ".rz") - argv[optind] != strlen(argv[optind]) - 3) {
- printf("razip: %s: unknown suffix -- ignored\n", argv[optind]);
- return 1;
- }
- name = strdup(argv[optind]);
- name[strlen(name) - 3] = '\0';
- f_dst = write_open(name, is_forced);
- free(name);
- } else f_dst = fileno(stdout);
- rz = razf_open(argv[optind], "r");
- buffer = malloc(WINDOW_SIZE);
- razf_seek(rz, start, SEEK_SET);
- while(1){
- if(end < 0) c = razf_read(rz, buffer, WINDOW_SIZE);
- else c = razf_read(rz, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
- if(c <= 0) break;
- start += c;
- write(f_dst, buffer, c);
- if(end >= 0 && start >= end) break;
- }
- free(buffer);
- if (!pstdout) unlink(argv[optind]);
- }
- razf_close(rz);
- return 0;
- }
-}
-
diff --git a/sam/sam.c b/sam/sam.c
deleted file mode 100644
index fa11df6..0000000
--- a/sam/sam.c
+++ /dev/null
@@ -1,186 +0,0 @@
-#include <string.h>
-#include <unistd.h>
-#include "faidx.h"
-#include "sam.h"
-
-#define TYPE_BAM 1
-#define TYPE_READ 2
-
-bam_header_t *bam_header_dup(const bam_header_t *h0)
-{
- bam_header_t *h;
- int i;
- h = bam_header_init();
- *h = *h0;
- h->hash = h->dict = h->rg2lib = 0;
- h->text = (char*)calloc(h->l_text + 1, 1);
- memcpy(h->text, h0->text, h->l_text);
- h->target_len = (uint32_t*)calloc(h->n_targets, 4);
- h->target_name = (char**)calloc(h->n_targets, sizeof(void*));
- for (i = 0; i < h->n_targets; ++i) {
- h->target_len[i] = h0->target_len[i];
- h->target_name[i] = strdup(h0->target_name[i]);
- }
- return h;
-}
-static void append_header_text(bam_header_t *header, char* text, int len)
-{
- int x = header->l_text + 1;
- int y = header->l_text + len + 1; // 1 byte null
- if (text == 0) return;
- kroundup32(x);
- kroundup32(y);
- if (x < y) header->text = (char*)realloc(header->text, y);
- strncpy(header->text + header->l_text, text, len); // we cannot use strcpy() here.
- header->l_text += len;
- header->text[header->l_text] = 0;
-}
-
-int samthreads(samfile_t *fp, int n_threads, int n_sub_blks)
-{
- if (!(fp->type&1) || (fp->type&2)) return -1;
- bgzf_mt(fp->x.bam, n_threads, n_sub_blks);
- return 0;
-}
-
-samfile_t *samopen(const char *fn, const char *mode, const void *aux)
-{
- samfile_t *fp;
- fp = (samfile_t*)calloc(1, sizeof(samfile_t));
- if (strchr(mode, 'r')) { // read
- fp->type |= TYPE_READ;
- if (strchr(mode, 'b')) { // binary
- fp->type |= TYPE_BAM;
- fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
- if (fp->x.bam == 0) goto open_err_ret;
- fp->header = bam_header_read(fp->x.bam);
- } else { // text
- fp->x.tamr = sam_open(fn);
- if (fp->x.tamr == 0) goto open_err_ret;
- fp->header = sam_header_read(fp->x.tamr);
- if (fp->header->n_targets == 0) { // no @SQ fields
- if (aux) { // check if aux is present
- bam_header_t *textheader = fp->header;
- fp->header = sam_header_read2((const char*)aux);
- if (fp->header == 0) goto open_err_ret;
- append_header_text(fp->header, textheader->text, textheader->l_text);
- bam_header_destroy(textheader);
- }
- if (fp->header->n_targets == 0 && bam_verbose >= 1)
- fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
- } else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
- }
- } else if (strchr(mode, 'w')) { // write
- fp->header = bam_header_dup((const bam_header_t*)aux);
- if (strchr(mode, 'b')) { // binary
- char bmode[3];
- int i, compress_level = -1;
- for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break;
- if (mode[i]) compress_level = mode[i] - '0';
- if (strchr(mode, 'u')) compress_level = 0;
- bmode[0] = 'w'; bmode[1] = compress_level < 0? 0 : compress_level + '0'; bmode[2] = 0;
- fp->type |= TYPE_BAM;
- fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode);
- if (fp->x.bam == 0) goto open_err_ret;
- bam_header_write(fp->x.bam, fp->header);
- } else { // text
- // open file
- fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout;
- if (fp->x.tamw == 0) goto open_err_ret;
- if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2;
- else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2;
- else fp->type |= BAM_OFDEC<<2;
- // write header
- if (strchr(mode, 'h')) {
- int i;
- bam_header_t *alt;
- // parse the header text
- alt = bam_header_init();
- alt->l_text = fp->header->l_text; alt->text = fp->header->text;
- sam_header_parse(alt);
- alt->l_text = 0; alt->text = 0;
- // check if there are @SQ lines in the header
- fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL
- if (alt->n_targets) { // then write the header text without dumping ->target_{name,len}
- if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1)
- fprintf(stderr, "[samopen] inconsistent number of target sequences. Output the text header.\n");
- } else { // then dump ->target_{name,len}
- for (i = 0; i < fp->header->n_targets; ++i)
- fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]);
- }
- bam_header_destroy(alt);
- }
- }
- }
- return fp;
-
-open_err_ret:
- free(fp);
- return 0;
-}
-
-void samclose(samfile_t *fp)
-{
- if (fp == 0) return;
- if (fp->header) bam_header_destroy(fp->header);
- if (fp->type & TYPE_BAM) bam_close(fp->x.bam);
- else if (fp->type & TYPE_READ) sam_close(fp->x.tamr);
- else fclose(fp->x.tamw);
- free(fp);
-}
-
-int samread(samfile_t *fp, bam1_t *b)
-{
- if (fp == 0 || !(fp->type & TYPE_READ)) return -1; // not open for reading
- if (fp->type & TYPE_BAM) return bam_read1(fp->x.bam, b);
- else return sam_read1(fp->x.tamr, fp->header, b);
-}
-
-int samwrite(samfile_t *fp, const bam1_t *b)
-{
- if (fp == 0 || (fp->type & TYPE_READ)) return -1; // not open for writing
- if (fp->type & TYPE_BAM) return bam_write1(fp->x.bam, b);
- else {
- char *s = bam_format1_core(fp->header, b, fp->type>>2&3);
- int l = strlen(s);
- fputs(s, fp->x.tamw); fputc('\n', fp->x.tamw);
- free(s);
- return l + 1;
- }
-}
-
-int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data)
-{
- bam_plbuf_t *buf;
- int ret;
- bam1_t *b;
- b = bam_init1();
- buf = bam_plbuf_init(func, func_data);
- bam_plbuf_set_mask(buf, mask);
- while ((ret = samread(fp, b)) >= 0)
- bam_plbuf_push(b, buf);
- bam_plbuf_push(0, buf);
- bam_plbuf_destroy(buf);
- bam_destroy1(b);
- return 0;
-}
-
-char *samfaipath(const char *fn_ref)
-{
- char *fn_list = 0;
- if (fn_ref == 0) return 0;
- fn_list = calloc(strlen(fn_ref) + 5, 1);
- strcat(strcpy(fn_list, fn_ref), ".fai");
- if (access(fn_list, R_OK) == -1) { // fn_list is unreadable
- if (access(fn_ref, R_OK) == -1) {
- fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref);
- } else {
- if (bam_verbose >= 3) fprintf(stderr, "[samfaipath] build FASTA index...\n");
- if (fai_build(fn_ref) == -1) {
- fprintf(stderr, "[samfaipath] fail to build FASTA index.\n");
- free(fn_list); fn_list = 0;
- }
- }
- }
- return fn_list;
-}
diff --git a/sam/sam.h b/sam/sam.h
deleted file mode 100644
index 0495501..0000000
--- a/sam/sam.h
+++ /dev/null
@@ -1,99 +0,0 @@
-#ifndef BAM_SAM_H
-#define BAM_SAM_H
-
-#include "bam.h"
-
-/*!
- @header
-
- This file provides higher level of I/O routines and unifies the APIs
- for SAM and BAM formats. These APIs are more convenient and
- recommended.
-
- @copyright Genome Research Ltd.
- */
-
-/*! @typedef
- @abstract SAM/BAM file handler
- @field type type of the handler; bit 1 for BAM, 2 for reading and bit 3-4 for flag format
- @field bam BAM file handler; valid if (type&1) == 1
- @field tamr SAM file handler for reading; valid if type == 2
- @field tamw SAM file handler for writing; valid if type == 0
- @field header header struct
- */
-typedef struct {
- int type;
- union {
- tamFile tamr;
- bamFile bam;
- FILE *tamw;
- } x;
- bam_header_t *header;
-} samfile_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- /*!
- @abstract Open a SAM/BAM file
-
- @param fn SAM/BAM file name; "-" is recognized as stdin (for
- reading) or stdout (for writing).
-
- @param mode open mode /[rw](b?)(u?)(h?)([xX]?)/: 'r' for reading,
- 'w' for writing, 'b' for BAM I/O, 'u' for uncompressed BAM output,
- 'h' for outputing header in SAM, 'x' for HEX flag and 'X' for
- string flag. If 'b' present, it must immediately follow 'r' or
- 'w'. Valid modes are "r", "w", "wh", "wx", "whx", "wX", "whX",
- "rb", "wb" and "wbu" exclusively.
-
- @param aux auxiliary data; if mode[0]=='w', aux points to
- bam_header_t; if strcmp(mode, "rb")!=0 and @SQ header lines in SAM
- are absent, aux points the file name of the list of the reference;
- aux is not used otherwise. If @SQ header lines are present in SAM,
- aux is not used, either.
-
- @return SAM/BAM file handler
- */
- samfile_t *samopen(const char *fn, const char *mode, const void *aux);
-
- /*!
- @abstract Close a SAM/BAM handler
- @param fp file handler to be closed
- */
- void samclose(samfile_t *fp);
-
- /*!
- @abstract Read one alignment
- @param fp file handler
- @param b alignment
- @return bytes read
- */
- int samread(samfile_t *fp, bam1_t *b);
-
- /*!
- @abstract Write one alignment
- @param fp file handler
- @param b alignment
- @return bytes written
- */
- int samwrite(samfile_t *fp, const bam1_t *b);
-
- /*!
- @abstract Get the pileup for a whole alignment file
- @param fp file handler
- @param mask mask transferred to bam_plbuf_set_mask()
- @param func user defined function called in the pileup process
- #param data user provided data for func()
- */
- int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data);
-
- char *samfaipath(const char *fn_ref);
- int samthreads(samfile_t *fp, int n_threads, int n_sub_blks);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/sam/sam_header.c b/sam/sam_header.c
deleted file mode 100644
index 88b6a1c..0000000
--- a/sam/sam_header.c
+++ /dev/null
@@ -1,810 +0,0 @@
-#include "sam_header.h"
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-#include <stdlib.h>
-#include <stdarg.h>
-
-#include "khash.h"
-KHASH_MAP_INIT_STR(str, const char *)
-
-struct _HeaderList
-{
- struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only.
- struct _HeaderList *next;
- void *data;
-};
-typedef struct _HeaderList list_t;
-typedef list_t HeaderDict;
-
-typedef struct
-{
- char key[2];
- char *value;
-}
-HeaderTag;
-
-typedef struct
-{
- char type[2];
- list_t *tags;
-}
-HeaderLine;
-
-const char *o_hd_tags[] = {"SO","GO",NULL};
-const char *r_hd_tags[] = {"VN",NULL};
-
-const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL};
-const char *r_sq_tags[] = {"SN","LN",NULL};
-const char *u_sq_tags[] = {"SN",NULL};
-
-const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL};
-const char *r_rg_tags[] = {"ID",NULL};
-const char *u_rg_tags[] = {"ID",NULL};
-
-const char *o_pg_tags[] = {"VN","CL",NULL};
-const char *r_pg_tags[] = {"ID",NULL};
-
-const char *types[] = {"HD","SQ","RG","PG","CO",NULL};
-const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL};
-const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL};
-const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL};
-
-
-static void debug(const char *format, ...)
-{
- va_list ap;
- va_start(ap, format);
- vfprintf(stderr, format, ap);
- va_end(ap);
-}
-
-#if 0
-// Replaced by list_append_to_end
-static list_t *list_prepend(list_t *root, void *data)
-{
- list_t *l = malloc(sizeof(list_t));
- l->next = root;
- l->data = data;
- return l;
-}
-#endif
-
-// Relies on the root->last being correct. Do not use with the other list_*
-// routines unless they are fixed to modify root->last as well.
-static list_t *list_append_to_end(list_t *root, void *data)
-{
- list_t *l = malloc(sizeof(list_t));
- l->last = l;
- l->next = NULL;
- l->data = data;
-
- if ( !root )
- return l;
-
- root->last->next = l;
- root->last = l;
- return root;
-}
-
-static list_t *list_append(list_t *root, void *data)
-{
- list_t *l = root;
- while (l && l->next)
- l = l->next;
- if ( l )
- {
- l->next = malloc(sizeof(list_t));
- l = l->next;
- }
- else
- {
- l = malloc(sizeof(list_t));
- root = l;
- }
- l->data = data;
- l->next = NULL;
- return root;
-}
-
-static void list_free(list_t *root)
-{
- list_t *l = root;
- while (root)
- {
- l = root;
- root = root->next;
- free(l);
- }
-}
-
-
-
-// Look for a tag "XY" in a predefined const char *[] array.
-static int tag_exists(const char *tag, const char **tags)
-{
- int itag=0;
- if ( !tags ) return -1;
- while ( tags[itag] )
- {
- if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag;
- itag++;
- }
- return -1;
-}
-
-
-
-// Mimics the behaviour of getline, except it returns pointer to the next chunk of the text
-// or NULL if everything has been read. The lineptr should be freed by the caller. The
-// newline character is stripped.
-static const char *nextline(char **lineptr, size_t *n, const char *text)
-{
- int len;
- const char *to = text;
-
- if ( !*to ) return NULL;
-
- while ( *to && *to!='\n' && *to!='\r' ) to++;
- len = to - text + 1;
-
- if ( *to )
- {
- // Advance the pointer for the next call
- if ( *to=='\n' ) to++;
- else if ( *to=='\r' && *(to+1)=='\n' ) to+=2;
- }
- if ( !len )
- return to;
-
- if ( !*lineptr )
- {
- *lineptr = malloc(len);
- *n = len;
- }
- else if ( *n<len )
- {
- *lineptr = realloc(*lineptr, len);
- *n = len;
- }
- if ( !*lineptr ) {
- debug("[nextline] Insufficient memory!\n");
- return 0;
- }
-
- memcpy(*lineptr,text,len);
- (*lineptr)[len-1] = 0;
-
- return to;
-}
-
-// name points to "XY", value_from points to the first character of the value string and
-// value_to points to the last character of the value string.
-static HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
-{
- HeaderTag *tag = malloc(sizeof(HeaderTag));
- int len = value_to-value_from+1;
-
- tag->key[0] = name[0];
- tag->key[1] = name[1];
- tag->value = malloc(len+1);
- memcpy(tag->value,value_from,len+1);
- tag->value[len] = 0;
- return tag;
-}
-
-static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
-{
- list_t *tags = hline->tags;
- while (tags)
- {
- HeaderTag *tag = tags->data;
- if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag;
- tags = tags->next;
- }
- return NULL;
-}
-
-
-// Return codes:
-// 0 .. different types or unique tags differ or conflicting tags, cannot be merged
-// 1 .. all tags identical -> no need to merge, drop one
-// 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated
-// 3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line
-static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2)
-{
- HeaderTag *t1, *t2;
-
- if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] )
- return 0;
-
- int itype = tag_exists(hline1->type,types);
- if ( itype==-1 ) {
- debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]);
- return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code
- }
-
- if ( unique_tags[itype] )
- {
- t1 = header_line_has_tag(hline1,unique_tags[itype][0]);
- t2 = header_line_has_tag(hline2,unique_tags[itype][0]);
- if ( !t1 || !t2 ) // this should never happen, the unique tags are required
- return 2;
-
- if ( strcmp(t1->value,t2->value) )
- return 0; // the unique tags differ, cannot be merged
- }
- if ( !required_tags[itype] && !optional_tags[itype] )
- {
- t1 = hline1->tags->data;
- t2 = hline2->tags->data;
- if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments
- return 0;
- }
-
- int missing=0, itag=0;
- while ( required_tags[itype] && required_tags[itype][itag] )
- {
- t1 = header_line_has_tag(hline1,required_tags[itype][itag]);
- t2 = header_line_has_tag(hline2,required_tags[itype][itag]);
- if ( !t1 && !t2 )
- return 2; // this should never happen
- else if ( !t1 || !t2 )
- missing = 1; // there is some tag missing in one of the hlines
- else if ( strcmp(t1->value,t2->value) )
- {
- if ( unique_tags[itype] )
- return 2; // the lines have a matching unique tag but have a conflicting tag
-
- return 0; // the lines contain conflicting tags, cannot be merged
- }
- itag++;
- }
- itag = 0;
- while ( optional_tags[itype] && optional_tags[itype][itag] )
- {
- t1 = header_line_has_tag(hline1,optional_tags[itype][itag]);
- t2 = header_line_has_tag(hline2,optional_tags[itype][itag]);
- if ( !t1 && !t2 )
- {
- itag++;
- continue;
- }
- if ( !t1 || !t2 )
- missing = 1; // there is some tag missing in one of the hlines
- else if ( strcmp(t1->value,t2->value) )
- {
- if ( unique_tags[itype] )
- return 2; // the lines have a matching unique tag but have a conflicting tag
-
- return 0; // the lines contain conflicting tags, cannot be merged
- }
- itag++;
- }
- if ( missing ) return 3; // there are some missing complementary tags with no conflicts, can be merged
- return 1;
-}
-
-
-static HeaderLine *sam_header_line_clone(const HeaderLine *hline)
-{
- list_t *tags;
- HeaderLine *out = malloc(sizeof(HeaderLine));
- out->type[0] = hline->type[0];
- out->type[1] = hline->type[1];
- out->tags = NULL;
-
- tags = hline->tags;
- while (tags)
- {
- HeaderTag *old = tags->data;
-
- HeaderTag *new = malloc(sizeof(HeaderTag));
- new->key[0] = old->key[0];
- new->key[1] = old->key[1];
- new->value = strdup(old->value);
- out->tags = list_append(out->tags, new);
-
- tags = tags->next;
- }
- return out;
-}
-
-static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline)
-{
- list_t *tmpl_tags;
-
- if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] )
- return 0;
-
- tmpl_tags = tmpl_hline->tags;
- while (tmpl_tags)
- {
- HeaderTag *tmpl_tag = tmpl_tags->data;
- HeaderTag *out_tag = header_line_has_tag(out_hline, tmpl_tag->key);
- if ( !out_tag )
- {
- HeaderTag *tag = malloc(sizeof(HeaderTag));
- tag->key[0] = tmpl_tag->key[0];
- tag->key[1] = tmpl_tag->key[1];
- tag->value = strdup(tmpl_tag->value);
- out_hline->tags = list_append(out_hline->tags,tag);
- }
- tmpl_tags = tmpl_tags->next;
- }
- return 1;
-}
-
-
-static HeaderLine *sam_header_line_parse(const char *headerLine)
-{
- HeaderLine *hline;
- HeaderTag *tag;
- const char *from, *to;
- from = headerLine;
-
- if ( *from != '@' ) {
- debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
- return 0;
- }
- to = ++from;
-
- while (*to && *to!='\t') to++;
- if ( to-from != 2 ) {
- debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine);
- return 0;
- }
-
- hline = malloc(sizeof(HeaderLine));
- hline->type[0] = from[0];
- hline->type[1] = from[1];
- hline->tags = NULL;
-
- int itype = tag_exists(hline->type, types);
-
- from = to;
- while (*to && *to=='\t') to++;
- if ( to-from != 1 ) {
- debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
- free(hline);
- return 0;
- }
- from = to;
- while (*from)
- {
- while (*to && *to!='\t') to++;
-
- if ( !required_tags[itype] && !optional_tags[itype] )
- {
- // CO is a special case, it can contain anything, including tabs
- if ( *to ) { to++; continue; }
- tag = new_tag(" ",from,to-1);
- }
- else
- tag = new_tag(from,from+3,to-1);
-
- if ( header_line_has_tag(hline,tag->key) )
- debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine);
- hline->tags = list_append(hline->tags, tag);
-
- from = to;
- while (*to && *to=='\t') to++;
- if ( *to && to-from != 1 ) {
- debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
- return 0;
- }
-
- from = to;
- }
- return hline;
-}
-
-
-// Must be of an existing type, all tags must be recognised and all required tags must be present
-static int sam_header_line_validate(HeaderLine *hline)
-{
- list_t *tags;
- HeaderTag *tag;
- int itype, itag;
-
- // Is the type correct?
- itype = tag_exists(hline->type, types);
- if ( itype==-1 )
- {
- debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]);
- return 0;
- }
-
- // Has all required tags?
- itag = 0;
- while ( required_tags[itype] && required_tags[itype][itag] )
- {
- if ( !header_line_has_tag(hline,required_tags[itype][itag]) )
- {
- debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1],
- hline->type[0],hline->type[1]);
- return 0;
- }
- itag++;
- }
-
- // Are all tags recognised?
- tags = hline->tags;
- while ( tags )
- {
- tag = tags->data;
- if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) )
- {
- // Lower case tags are user-defined values.
- if( !(islower(tag->key[0]) || islower(tag->key[1])) )
- {
- // Neither is lower case, but tag was not recognized.
- debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
- // return 0; // Even unknown tags are allowed - for forward compatibility with new attributes
- }
- // else - allow user defined tag
- }
- tags = tags->next;
- }
-
- return 1;
-}
-
-
-static void print_header_line(FILE *fp, HeaderLine *hline)
-{
- list_t *tags = hline->tags;
- HeaderTag *tag;
-
- fprintf(fp, "@%c%c", hline->type[0],hline->type[1]);
- while (tags)
- {
- tag = tags->data;
-
- fprintf(fp, "\t");
- if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
- fprintf(fp, "%c%c:", tag->key[0],tag->key[1]);
- fprintf(fp, "%s", tag->value);
-
- tags = tags->next;
- }
- fprintf(fp,"\n");
-}
-
-
-static void sam_header_line_free(HeaderLine *hline)
-{
- list_t *tags = hline->tags;
- while (tags)
- {
- HeaderTag *tag = tags->data;
- free(tag->value);
- free(tag);
- tags = tags->next;
- }
- list_free(hline->tags);
- free(hline);
-}
-
-void sam_header_free(void *_header)
-{
- HeaderDict *header = (HeaderDict*)_header;
- list_t *hlines = header;
- while (hlines)
- {
- sam_header_line_free(hlines->data);
- hlines = hlines->next;
- }
- list_free(header);
-}
-
-HeaderDict *sam_header_clone(const HeaderDict *dict)
-{
- HeaderDict *out = NULL;
- while (dict)
- {
- HeaderLine *hline = dict->data;
- out = list_append(out, sam_header_line_clone(hline));
- dict = dict->next;
- }
- return out;
-}
-
-// Returns a newly allocated string
-char *sam_header_write(const void *_header)
-{
- const HeaderDict *header = (const HeaderDict*)_header;
- char *out = NULL;
- int len=0, nout=0;
- const list_t *hlines;
-
- // Calculate the length of the string to allocate
- hlines = header;
- while (hlines)
- {
- len += 4; // @XY and \n
-
- HeaderLine *hline = hlines->data;
- list_t *tags = hline->tags;
- while (tags)
- {
- HeaderTag *tag = tags->data;
- len += strlen(tag->value) + 1; // \t
- if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
- len += strlen(tag->value) + 3; // XY:
- tags = tags->next;
- }
- hlines = hlines->next;
- }
-
- nout = 0;
- out = malloc(len+1);
- hlines = header;
- while (hlines)
- {
- HeaderLine *hline = hlines->data;
-
- nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]);
-
- list_t *tags = hline->tags;
- while (tags)
- {
- HeaderTag *tag = tags->data;
- nout += sprintf(out+nout,"\t");
- if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
- nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]);
- nout += sprintf(out+nout,"%s", tag->value);
- tags = tags->next;
- }
- hlines = hlines->next;
- nout += sprintf(out+nout,"\n");
- }
- out[len] = 0;
- return out;
-}
-
-void *sam_header_parse2(const char *headerText)
-{
- list_t *hlines = NULL;
- HeaderLine *hline;
- const char *text;
- char *buf=NULL;
- size_t nbuf = 0;
- int tovalidate = 0;
-
- if ( !headerText )
- return 0;
-
- text = headerText;
- while ( (text=nextline(&buf, &nbuf, text)) )
- {
- hline = sam_header_line_parse(buf);
- if ( hline && (!tovalidate || sam_header_line_validate(hline)) )
- // With too many (~250,000) reference sequences the header parsing was too slow with list_append.
- hlines = list_append_to_end(hlines, hline);
- else
- {
- if (hline) sam_header_line_free(hline);
- sam_header_free(hlines);
- if ( buf ) free(buf);
- return NULL;
- }
- }
- if ( buf ) free(buf);
-
- return hlines;
-}
-
-// Build a string hash table from the header lines of the given type:
-// the value of key_tag maps to the value of value_tag. Lines lacking either
-// tag are ignored. When a key occurs more than once, a debug message is
-// emitted and the later value overwrites the earlier one.
-//
-// A NULL dictionary yields an empty (not NULL) table. The table stores
-// pointers into the dictionary; it does not copy the strings.
-void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2])
-{
-    const list_t *node;
-    khash_t(str) *tbl = kh_init(str);
-    int absent;
-
-    if (_dict == 0) return tbl; // return an empty (not null) hash table
-
-    for (node = (const HeaderDict*)_dict; node; node = node->next)
-    {
-        HeaderLine *hline = node->data;
-        HeaderTag *key, *value;
-        khiter_t it;
-
-        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
-            continue;
-
-        key = header_line_has_tag(hline,key_tag);
-        value = header_line_has_tag(hline,value_tag);
-        if ( !key || !value )
-            continue;
-
-        it = kh_get(str, tbl, key->value);
-        if ( it != kh_end(tbl) )
-            debug("[sam_header_lookup_table] They key %s not unique.\n", key->value);
-        it = kh_put(str, tbl, key->value, &absent);
-        kh_value(tbl, it) = value->value;
-    }
-    return tbl;
-}
-
-// Collect the value of key_tag from every header line of the given type.
-//
-// On return *_n holds the number of collected values. The returned array is
-// allocated here and must be free()d by the caller; its elements point into
-// the dictionary and must not be freed. Returns NULL with *_n == 0 when
-// nothing matches or when memory runs out.
-char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n)
-{
-    const list_t *l = (const HeaderDict*)_dict;
-    int max = 0, n = 0;
-    char **ret = NULL;
-
-    *_n = 0;
-    for (; l; l = l->next)
-    {
-        HeaderLine *hline = l->data;
-        HeaderTag *key;
-
-        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
-            continue;
-
-        key = header_line_has_tag(hline,key_tag);
-        if ( !key )
-            continue;
-
-        if (n == max) {
-            // Grow geometrically; keep the old block until realloc has succeeded.
-            int new_max = max? max<<1 : 4;
-            char **tmp = realloc(ret, new_max * sizeof(*ret));
-            if ( !tmp ) {
-                free(ret);
-                return NULL;
-            }
-            ret = tmp;
-            max = new_max;
-        }
-        ret[n++] = key->value;
-    }
-    *_n = n;
-    return ret;
-}
-
-// Iterator over header lines of the given type that carry BOTH key_tag and
-// value_tag. Starting at iter, finds the next such line, stores pointers to
-// the two tag values in *_key and *_value, and returns the list node that
-// follows the match so the caller can pass it back in to continue.
-//
-// Returns NULL when iter is NULL or no further line matches; *_key and
-// *_value are left untouched in that case.
-// NOTE(review): when the match is the last node of the list the return value
-// is also NULL even though *_key / *_value were filled in - callers looping
-// on the return value alone will not see that final pair.
-void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value)
-{
-    list_t *l;
-
-    for (l = iter; l; l = l->next)
-    {
-        HeaderLine *hline = l->data;
-        HeaderTag *key, *value;
-
-        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
-            continue;
-
-        key = header_line_has_tag(hline,key_tag);
-        value = header_line_has_tag(hline,value_tag);
-        // Both tags are dereferenced below, so skip the line if either is missing.
-        if ( !key || !value )
-            continue;
-
-        *_key = key->value;
-        *_value = value->value;
-        return l->next;
-    }
-    return NULL;
-}
-
-// Look up key in a table built by sam_header2tbl().
-// Returns the stored value, or 0 when the key is absent or the table is NULL
-// (the NULL case mirrors sam_tbl_size(), which also accepts a NULL table).
-const char *sam_tbl_get(void *h, const char *key)
-{
-    khash_t(str) *tbl = (khash_t(str)*)h;
-    khint_t k;
-    if ( !tbl ) return 0;
-    k = kh_get(str, tbl, key);
-    return k == kh_end(tbl)? 0 : kh_val(tbl, k);
-}
-
-// Number of entries in a table built by sam_header2tbl(); 0 for a NULL table.
-int sam_tbl_size(void *h)
-{
-    khash_t(str) *tbl = (khash_t(str)*)h;
-    if ( !h )
-        return 0;
-    return kh_size(tbl);
-}
-
-// Release a table built by sam_header2tbl(). Only the table itself is handed
-// to kh_destroy(); the key/value strings are not touched here.
-void sam_tbl_destroy(void *h)
-{
-    kh_destroy(str, (khash_t(str)*)h);
-}
-
-// Merge n parsed headers into a newly allocated one.
-//
-// The result starts as a clone of _dicts[0]. Every line of each further
-// dictionary is compared against the lines collected so far using
-// sam_header_compare_lines():
-//   0     - keep looking at the next output line,
-//   2     - conflict: the merge is abandoned,
-//   3     - the line is merged into the matching output line,
-//   other - an existing output line already covers it; nothing to add.
-// A line for which every comparison returned 0 is cloned and appended.
-//
-// Returns the merged dictionary, NULL when n < 2, or 0 on a conflict. The
-// partially built result is released before returning on a conflict.
-void *sam_header_merge(int n, const void **_dicts)
-{
-    const HeaderDict **dicts = (const HeaderDict**)_dicts;
-    HeaderDict *out_dict;
-    int idict, status;
-
-    if ( n<2 ) return NULL;
-
-    out_dict = sam_header_clone(dicts[0]);
-
-    for (idict=1; idict<n; idict++)
-    {
-        const list_t *tmpl_hlines;
-
-        for (tmpl_hlines = dicts[idict]; tmpl_hlines; tmpl_hlines = tmpl_hlines->next)
-        {
-            list_t *out_hlines;
-            int inserted = 0;
-
-            for (out_hlines = out_dict; out_hlines; out_hlines = out_hlines->next)
-            {
-                status = sam_header_compare_lines(tmpl_hlines->data, out_hlines->data);
-                if ( status==0 )
-                    continue;
-
-                if ( status==2 )
-                {
-                    print_header_line(stderr,tmpl_hlines->data);
-                    print_header_line(stderr,out_hlines->data);
-                    debug("Conflicting lines, cannot merge the headers.\n");
-                    // Do not leak the clone and the lines appended so far.
-                    sam_header_free(out_dict);
-                    return 0;
-                }
-                if ( status==3 )
-                    sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data);
-
-                inserted = 1;
-                break;
-            }
-            if ( !inserted )
-                out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data));
-        }
-    }
-
-    return out_dict;
-}
-
-// Build a flat row-major table of tag values for all header lines of the
-// given type. tags is a NULL-terminated list of two-letter tag names; row i,
-// column j (index i*ntags+j) holds the value of tags[j] on the i-th matching
-// line, or NULL when that line lacks the tag.
-//
-// *n receives the number of rows. The returned array must be free()d by the
-// caller; its elements point into the dictionary and must not be freed.
-// Returns NULL with *n == 0 for a NULL dictionary or when memory runs out.
-char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n)
-{
-    int nout = 0;
-    char **out = NULL;
-    list_t *l = (list_t *)dict;
-    int i, ntags = 0;
-
-    *n = 0;
-    if ( !l ) return NULL;
-
-    while ( tags[ntags] ) ntags++;
-
-    for (; l; l = l->next)
-    {
-        HeaderLine *hline = l->data;
-        char **tmp;
-
-        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
-            continue;
-
-        // Grow by one row; keep the old block until realloc has succeeded.
-        // With ntags == 0 the requested size is 0 and a NULL result is not an error.
-        tmp = (char**) realloc(out, sizeof(char*)*(nout+1)*ntags);
-        if ( !tmp && ntags )
-        {
-            free(out);
-            return NULL;
-        }
-        out = tmp;
-
-        for (i=0; i<ntags; i++)
-        {
-            HeaderTag *key = header_line_has_tag(hline, tags[i]);
-            out[nout*ntags+i] = key ? key->value : NULL;
-        }
-        nout++;
-    }
-    *n = nout;
-    return out;
-}
-
diff --git a/sam/sam_header.h b/sam/sam_header.h
deleted file mode 100644
index 4b0cb03..0000000
--- a/sam/sam_header.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef __SAM_HEADER_H__
-#define __SAM_HEADER_H__
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- void *sam_header_parse2(const char *headerText);
- void *sam_header_merge(int n, const void **dicts);
- void sam_header_free(void *header);
- char *sam_header_write(const void *headerDict); // returns a newly allocated string
-
- /*
- // Usage example
- const char *key, *val;
- void *iter = sam_header_parse2(bam->header->text);
- while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key,&val)) ) printf("%s\t%s\n", key,val);
- */
- void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **key, const char **value);
- char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n);
-
- /*
- // Usage example
- int i, j, n;
- const char *tags[] = {"SN","LN","UR","M5",NULL};
- void *dict = sam_header_parse2(bam->header->text);
- char **tbl = sam_header2tbl_n(dict, "SQ", tags, &n);
- for (i=0; i<n; i++)
- {
- for (j=0; j<4; j++)
- if ( tbl[4*i+j] ) printf("\t%s", tbl[4*i+j]);
- else printf("-");
- printf("\n");
- }
- if (tbl) free(tbl);
- */
- char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n);
-
- void *sam_header2tbl(const void *dict, char type[2], char key_tag[2], char value_tag[2]);
- const char *sam_tbl_get(void *h, const char *key);
- int sam_tbl_size(void *h);
- void sam_tbl_destroy(void *h);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/sam/sam_view.c b/sam/sam_view.c
deleted file mode 100644
index 7f3fdab..0000000
--- a/sam/sam_view.c
+++ /dev/null
@@ -1,441 +0,0 @@
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <math.h>
-#include <inttypes.h>
-#include "sam_header.h"
-#include "sam.h"
-#include "faidx.h"
-#include "kstring.h"
-#include "khash.h"
-KHASH_SET_INIT_STR(rg)
-
-// When counting records instead of printing them,
-// data passed to the bam_fetch callback is encapsulated in this struct.
-typedef struct {
- bam_header_t *header;
- int64_t *count; // int does overflow for very big BAMs
-} count_func_data_t;
-
-typedef khash_t(rg) *rghash_t;
-
-// FIXME: we'd better use no global variables...
-static rghash_t g_rghash = 0;
-static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0, g_qual_scale = 0, g_min_qlen = 0;
-static uint32_t g_subsam_seed = 0;
-static double g_subsam_frac = -1.;
-static char *g_library, *g_rg;
-static void *g_bed;
-
-void *bed_read(const char *fn);
-void bed_destroy(void *_h);
-int bed_overlap(const void *_h, const char *chr, int beg, int end);
-
-static int process_aln(const bam_header_t *h, bam1_t *b) // applies the global g_* filters to one record: returns 0 to keep it, 1 to skip it; may rewrite b's base qualities
-{
- if (g_qual_scale > 1) { // optional quality rescaling, done before any filter and regardless of the outcome
- int i;
- uint8_t *qual = bam1_qual(b);
- for (i = 0; i < b->core.l_qseq; ++i) {
- int c = qual[i] * g_qual_scale;
- qual[i] = c < 93? c : 93; // scaled quality is capped at 93
- }
- }
- if (g_min_qlen > 0) { // minimum length filter based on the CIGAR
- int k, qlen = 0;
- uint32_t *cigar = bam1_cigar(b);
- for (k = 0; k < b->core.n_cigar; ++k)
- if ((bam_cigar_type(bam_cigar_op(cigar[k]))&1) || bam_cigar_op(cigar[k]) == BAM_CHARD_CLIP) // presumably bit 0 means "op consumes query bases" - confirm in bam.h; hard clips are counted as well
- qlen += bam_cigar_oplen(cigar[k]);
- if (qlen < g_min_qlen) return 1;
- }
- if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off)) // mapping quality floor; all g_flag_on bits must be set; no g_flag_off bit may be set
- return 1;
- if (g_bed && b->core.tid >= 0 && !bed_overlap(g_bed, h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)))) // records with tid < 0 bypass the BED filter
- return 1;
- if (g_subsam_frac > 0.) { // subsampling is keyed on the read name, so equal names always get the same decision
- uint32_t k = __ac_X31_hash_string(bam1_qname(b)) + g_subsam_seed;
- if ((double)(k&0xffffff) / 0x1000000 >= g_subsam_frac) return 1; // low 24 bits of the hash mapped to [0,1)
- }
- if (g_rg || g_rghash) {
- uint8_t *s = bam_aux_get(b, "RG");
- if (s) { // a record without an RG tag is not decided here; it falls through to the library check
- if (g_rg) return (strcmp(g_rg, (char*)(s + 1)) == 0)? 0 : 1; // s + 1 skips the first byte of the aux field (presumably its type code - confirm)
- if (g_rghash) {
- khint_t k = kh_get(rg, g_rghash, (char*)(s + 1));
- return (k != kh_end(g_rghash))? 0 : 1;
- }
- }
- }
- if (g_library) { // final filter: the record's library must equal g_library
- const char *p = bam_get_library((bam_header_t*)h, b);
- return (p && strcmp(p, g_library) == 0)? 0 : 1;
- }
- return 0;
-}
-
-// Return a copy of header text hdtxt in which every "@RG" line whose ID is
-// not a key of h has been dropped (an @RG line with no ID field is dropped
-// too). All other lines are copied unchanged, each terminated by "\n".
-// Copying stops at the first line shorter than three characters.
-//
-// *len receives the length of the result. The returned buffer is owned by
-// the caller; it is whatever kstring left in str.s, i.e. NULL if no line was
-// kept. hdtxt is modified temporarily while an ID is looked up and restored
-// before returning.
-static char *drop_rg(char *hdtxt, rghash_t h, int *len)
-{
-    char *p = hdtxt, *q, *r, *s;
-    kstring_t str;
-    memset(&str, 0, sizeof(kstring_t));
-    while (1) {
-        int toprint = 0;
-        q = strchr(p, '\n');
-        if (q == 0) q = p + strlen(p);
-        if (q - p < 3) break; // the line is too short; then stop
-        if (strncmp(p, "@RG\t", 4) == 0) {
-            int c;
-            khint_t k;
-            r = strstr(p, "\tID:");
-            // Only accept an ID that lies on this line, not one found on a later line.
-            if (r != 0 && r < q) {
-                r += 4;
-                for (s = r; *s != '\0' && *s != '\n' && *s != '\t'; ++s);
-                c = *s; *s = '\0';
-                k = kh_get(rg, h, r);
-                *s = c;
-                if (k != kh_end(h)) toprint = 1;
-            }
-        } else toprint = 1;
-        if (toprint) {
-            kputsn(p, q - p, &str); kputc('\n', &str);
-        }
-        // The last line may lack a trailing newline; do not step past the NUL.
-        if (*q == '\0') break;
-        p = q + 1;
-    }
-    *len = str.l;
-    return str.s;
-}
-
-// callback function for bam_fetch() that prints nonskipped records
-// bam_fetch() callback: data is the output samfile_t; records that pass
-// process_aln() are written to it. Always returns 0.
-static int view_func(const bam1_t *b, void *data)
-{
-    samfile_t *out = (samfile_t*)data;
-    if (process_aln(out->header, (bam1_t*)b) == 0)
-        samwrite(out, b);
-    return 0;
-}
-
-// callback function for bam_fetch() that counts nonskipped records
-// bam_fetch() callback: data is a count_func_data_t; the counter it points
-// to is incremented for every record that passes process_aln().
-// Always returns 0.
-static int count_func(const bam1_t *b, void *data)
-{
-    count_func_data_t *cd = (count_func_data_t*)data;
-    if (process_aln(cd->header, (bam1_t*)b) == 0)
-        ++(*cd->count);
-    return 0;
-}
-
-static int usage(int is_long_help);
-
-// Entry point of `samtools view': parses the options into the g_* filter
-// globals, opens the input (and, unless only counting, the output), then
-// either streams the whole file or fetches the regions given on the command
-// line from an indexed BAM. Returns 0 on success and 1 on any failure.
-int main_samview(int argc, char *argv[])
-{
-    int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, compress_level = -1, is_bamout = 0, is_count = 0;
-    int of_type = BAM_OFDEC, is_long_help = 0, n_threads = 0;
-    int64_t count = 0;
-    samfile_t *in = 0, *out = 0;
-    char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0, *q;
-
-    /* parse command-line options */
-    strcpy(in_mode, "r"); strcpy(out_mode, "w");
-    while ((c = getopt(argc, argv, "SbBct:h1Ho:q:f:F:ul:r:xX?T:R:L:s:Q:@:m:")) >= 0) {
-        switch (c) {
-        case 's':
-            // "-s SEED.FRAC": the integer part seeds the sampler, the rest is the fraction
-            if ((g_subsam_seed = strtol(optarg, &q, 10)) != 0) {
-                srand(g_subsam_seed);
-                g_subsam_seed = rand();
-            }
-            g_subsam_frac = strtod(q, &q);
-            break;
-        case 'm': g_min_qlen = atoi(optarg); break;
-        case 'c': is_count = 1; break;
-        case 'S': is_bamin = 0; break;
-        case 'b': is_bamout = 1; break;
-        case 't': fn_list = strdup(optarg); is_bamin = 0; break;
-        case 'h': is_header = 1; break;
-        case 'H': is_header_only = 1; break;
-        case 'o': fn_out = strdup(optarg); break;
-        case 'f': g_flag_on = strtol(optarg, 0, 0); break;
-        case 'F': g_flag_off = strtol(optarg, 0, 0); break;
-        case 'q': g_min_mapQ = atoi(optarg); break;
-        case 'u': compress_level = 0; break;
-        case '1': compress_level = 1; break;
-        case 'l': g_library = strdup(optarg); break;
-        case 'L': g_bed = bed_read(optarg); break;
-        case 'r': g_rg = strdup(optarg); break;
-        case 'R': fn_rg = strdup(optarg); break;
-        case 'x': of_type = BAM_OFHEX; break;
-        case 'X': of_type = BAM_OFSTR; break;
-        case '?': is_long_help = 1; break;
-        case 'T': fn_ref = strdup(optarg); is_bamin = 0; break;
-        case 'B': bam_no_B = 1; break;
-        case 'Q': g_qual_scale = atoi(optarg); break;
-        case '@': n_threads = strtol(optarg, 0, 0); break;
-        default: return usage(is_long_help);
-        }
-    }
-    // Build the samopen() mode strings; out_mode holds at most "wbh1" plus the NUL.
-    if (compress_level >= 0) is_bamout = 1;
-    if (is_header_only) is_header = 1;
-    if (is_bamout) strcat(out_mode, "b");
-    else {
-        if (of_type == BAM_OFHEX) strcat(out_mode, "x");
-        else if (of_type == BAM_OFSTR) strcat(out_mode, "X");
-    }
-    if (is_bamin) strcat(in_mode, "b");
-    if (is_header) strcat(out_mode, "h");
-    if (compress_level >= 0) {
-        char tmp[2];
-        tmp[0] = compress_level + '0'; tmp[1] = '\0';
-        strcat(out_mode, tmp);
-    }
-    if (argc == optind) return usage(is_long_help); // potential memory leak...
-
-    // read the list of read groups
-    if (fn_rg) {
-        FILE *fp_rg;
-        char buf[1024];
-        int absent; // kh_put() status; named so that it does not shadow the function's `ret'
-        g_rghash = kh_init(rg);
-        fp_rg = fopen(fn_rg, "r");
-        if (fp_rg == 0) {
-            fprintf(stderr, "[main_samview] fail to open \"%s\" for reading.\n", fn_rg);
-            ret = 1;
-            goto view_end;
-        }
-        // The field width keeps an over-long token from overflowing buf.
-        while (fscanf(fp_rg, "%1023s", buf) > 0) {
-            char *id = strdup(buf);
-            kh_put(rg, g_rghash, id, &absent);
-            if (absent == 0) free(id); // duplicate ID: the table kept the earlier copy
-        }
-        fclose(fp_rg);
-    }
-
-    // generate the fn_list if necessary
-    if (fn_list == 0 && fn_ref) fn_list = samfaipath(fn_ref);
-    // open file handlers
-    if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) {
-        fprintf(stderr, "[main_samview] fail to open \"%s\" for reading.\n", argv[optind]);
-        ret = 1;
-        goto view_end;
-    }
-    if (in->header == 0) {
-        fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", argv[optind]);
-        ret = 1;
-        goto view_end;
-    }
-    if (g_rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for...
-        // Keep only the @RG header lines that were requested with -R.
-        char *tmp;
-        int l;
-        tmp = drop_rg(in->header->text, g_rghash, &l);
-        free(in->header->text);
-        in->header->text = tmp;
-        in->header->l_text = l;
-    }
-    if (!is_count && (out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) {
-        fprintf(stderr, "[main_samview] fail to open \"%s\" for writing.\n", fn_out? fn_out : "standard output");
-        ret = 1;
-        goto view_end;
-    }
-    // No output file exists when only counting (-c), so there is nothing to thread.
-    if (out && n_threads > 1) samthreads(out, n_threads, 256);
-    if (is_header_only) goto view_end; // no need to print alignments
-
-    if (argc == optind + 1) { // convert/print the entire file
-        bam1_t *b = bam_init1();
-        int r;
-        while ((r = samread(in, b)) >= 0) { // read one alignment from `in'
-            if (!process_aln(in->header, b)) {
-                if (!is_count) samwrite(out, b); // write the alignment to `out'
-                count++;
-            }
-        }
-        if (r < -1) {
-            fprintf(stderr, "[main_samview] truncated file.\n");
-            ret = 1;
-        }
-        bam_destroy1(b);
-    } else { // retrieve alignments in specified regions
-        int i;
-        bam_index_t *idx = 0;
-        if (is_bamin) idx = bam_index_load(argv[optind]); // load BAM index
-        if (idx == 0) { // index is unavailable
-            fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM files.\n");
-            ret = 1;
-            goto view_end;
-        }
-        for (i = optind + 1; i < argc; ++i) {
-            int tid, beg, end, result;
-            bam_parse_region(in->header, argv[i], &tid, &beg, &end); // parse a region in the format like `chr2:100-200'
-            if (tid < 0) { // reference name is not found
-                fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]);
-                continue;
-            }
-            // fetch alignments
-            if (is_count) {
-                count_func_data_t count_data = { in->header, &count };
-                result = bam_fetch(in->x.bam, idx, tid, beg, end, &count_data, count_func);
-            } else
-                result = bam_fetch(in->x.bam, idx, tid, beg, end, out, view_func);
-            if (result < 0) {
-                fprintf(stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]);
-                ret = 1;
-                break;
-            }
-        }
-        bam_index_destroy(idx); // destroy the BAM index
-    }
-
-view_end:
-    if (is_count && ret == 0)
-        printf("%" PRId64 "\n", count);
-
-    // close files, free and return
-    free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg);
-    if (g_bed) bed_destroy(g_bed);
-    if (g_rghash) {
-        khint_t k;
-        for (k = 0; k < kh_end(g_rghash); ++k)
-            if (kh_exist(g_rghash, k)) free((char*)kh_key(g_rghash, k));
-        kh_destroy(rg, g_rghash);
-    }
-    samclose(in);
-    if (!is_count)
-        samclose(out);
-    return ret;
-}
-
-static int usage(int is_long_help) // prints the `samtools view' option summary to stderr and always returns 1
-{
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools view [options] <in.bam>|<in.sam> [region1 [...]]\n\n");
- fprintf(stderr, "Options: -b output BAM\n");
- fprintf(stderr, " -h print header for the SAM output\n");
- fprintf(stderr, " -H print header only (no alignments)\n");
- fprintf(stderr, " -S input is SAM\n");
- fprintf(stderr, " -u uncompressed BAM output (force -b)\n");
- fprintf(stderr, " -1 fast compression (force -b)\n");
- fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n");
- fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n");
- fprintf(stderr, " -c print only the count of matching records\n");
- fprintf(stderr, " -B collapse the backward CIGAR operation\n");
- fprintf(stderr, " -@ INT number of BAM compression threads [0]\n");
- fprintf(stderr, " -L FILE output alignments overlapping the input BED FILE [null]\n");
- fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n");
- fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n");
- fprintf(stderr, " -o FILE output file name [stdout]\n");
- fprintf(stderr, " -R FILE list of read groups to be outputted [null]\n");
- fprintf(stderr, " -f INT required flag, 0 for unset [0]\n");
- fprintf(stderr, " -F INT filtering flag, 0 for unset [0]\n");
- fprintf(stderr, " -q INT minimum mapping quality [0]\n");
- fprintf(stderr, " -l STR only output reads in library STR [null]\n");
- fprintf(stderr, " -r STR only output reads in read group STR [null]\n");
- fprintf(stderr, " -s FLOAT fraction of templates to subsample; integer part as seed [-1]\n");
- fprintf(stderr, " -? longer help\n");
- fprintf(stderr, "\n");
- if (is_long_help) // the explanatory notes are only printed for the long help (-?)
- fprintf(stderr, "Notes:\n\
-\n\
- 1. By default, this command assumes the file on the command line is in\n\
- the BAM format and it prints the alignments in SAM. If `-t' is\n\
- applied, the input file is assumed to be in the SAM format. The\n\
- file supplied with `-t' is SPACE/TAB delimited with the first two\n\
- fields of each line consisting of the reference name and the\n\
- corresponding sequence length. The `.fai' file generated by `faidx'\n\
- can be used here. This file may be empty if reads are unaligned.\n\
-\n\
- 2. SAM->BAM conversion: `samtools view -bT ref.fa in.sam.gz'.\n\
-\n\
- 3. BAM->SAM conversion: `samtools view in.bam'.\n\
-\n\
- 4. A region should be presented in one of the following formats:\n\
- `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n\
- specified, the input alignment file must be an indexed BAM file.\n\
-\n\
- 5. Option `-u' is preferred over `-b' when the output is piped to\n\
- another samtools command.\n\
-\n\
- 6. In a string FLAG, each character represents one bit with\n\
- p=0x1 (paired), P=0x2 (properly paired), u=0x4 (unmapped),\n\
- U=0x8 (mate unmapped), r=0x10 (reverse), R=0x20 (mate reverse)\n\
- 1=0x40 (first), 2=0x80 (second), s=0x100 (not primary), \n\
- f=0x200 (failure) and d=0x400 (duplicate). Note that `-x' and\n\
- `-X' are samtools-C specific. Picard and older samtools do not\n\
- support HEX or string flags.\n\
-\n");
- return 1; // non-zero so callers can return it directly as a failure status
-}
-
-// `import <in.ref_list> <in.sam> <out.bam>': thin wrapper that runs
-// main_samview() with the equivalent arguments
-// "-o <out.bam> -bt <in.ref_list> <in.sam>".
-// Returns 1 on a usage error, otherwise whatever main_samview() returns.
-int main_import(int argc, char *argv[])
-{
-    // A fixed-size stack array avoids an unchecked heap allocation.
-    char *argv2[6];
-
-    if (argc != 4) {
-        fprintf(stderr, "Usage: bamtk import <in.ref_list> <in.sam> <out.bam>\n");
-        return 1;
-    }
-    argv2[0] = "import";
-    argv2[1] = "-o";
-    argv2[2] = argv[3];
-    argv2[3] = "-bt";
-    argv2[4] = argv[1];
-    argv2[5] = argv[2];
-    return main_samview(6, argv2);
-}
-
-int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 };
-
-// `samtools bam2fq [-n] <in.bam>': write every record of a BAM file ("-" for
-// standard input) to standard output as FASTQ. Unless -n is given, "/1" or
-// "/2" is appended to the name of reads flagged as exactly one of
-// first (0x40) / second (0x80). Reads flagged 0x10 are reverse-complemented
-// and their qualities reversed.
-// Returns 0 on success, 1 on a usage error, open failure or out-of-memory.
-int main_bam2fq(int argc, char *argv[])
-{
-    bamFile fp;
-    bam_header_t *h;
-    bam1_t *b;
-    int8_t *buf;
-    int max_buf, c, no12 = 0, ret = 0;
-    while ((c = getopt(argc, argv, "n")) > 0)
-        if (c == 'n') no12 = 1;
-    // An input file must remain after the options; otherwise argv[optind] is NULL.
-    if (optind >= argc) {
-        fprintf(stderr, "Usage: samtools bam2fq <in.bam>\n");
-        return 1;
-    }
-    fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
-    if (fp == 0) return 1;
-    h = bam_header_read(fp);
-    b = bam_init1();
-    buf = 0;
-    max_buf = 0;
-    while (bam_read1(fp, b) >= 0) {
-        int i, qlen = b->core.l_qseq;
-        uint8_t *seq;
-        // Grow the scratch buffer before printing anything, so that an
-        // allocation failure never leaves a half-written record behind.
-        if (max_buf < qlen + 1) {
-            int8_t *tmp;
-            max_buf = qlen + 1;
-            kroundup32(max_buf);
-            tmp = realloc(buf, max_buf);
-            if (tmp == 0) {
-                fprintf(stderr, "[main_bam2fq] out of memory.\n");
-                ret = 1;
-                break;
-            }
-            buf = tmp;
-        }
-        putchar('@'); fputs(bam1_qname(b), stdout);
-        if (no12) putchar('\n');
-        else {
-            if ((b->core.flag & 0x40) && !(b->core.flag & 0x80)) puts("/1");
-            else if ((b->core.flag & 0x80) && !(b->core.flag & 0x40)) puts("/2");
-            else putchar('\n');
-        }
-        buf[qlen] = 0;
-        seq = bam1_seq(b);
-        for (i = 0; i < qlen; ++i)
-            buf[i] = bam1_seqi(seq, i);
-        if (b->core.flag & 16) { // reverse complement
-            for (i = 0; i < qlen>>1; ++i) {
-                int8_t t = seq_comp_table[buf[qlen - 1 - i]];
-                buf[qlen - 1 - i] = seq_comp_table[buf[i]];
-                buf[i] = t;
-            }
-            if (qlen&1) buf[i] = seq_comp_table[buf[i]]; // middle base of an odd-length read
-        }
-        for (i = 0; i < qlen; ++i)
-            buf[i] = bam_nt16_rev_table[buf[i]];
-        puts((char*)buf);
-        puts("+");
-        seq = bam1_qual(b);
-        for (i = 0; i < qlen; ++i)
-            buf[i] = 33 + seq[i]; // qualities are printed with an offset of 33
-        if (b->core.flag & 16) { // reverse
-            for (i = 0; i < qlen>>1; ++i) {
-                int8_t t = buf[qlen - 1 - i];
-                buf[qlen - 1 - i] = buf[i];
-                buf[i] = t;
-            }
-        }
-        puts((char*)buf);
-    }
-    free(buf);
-    bam_destroy1(b);
-    bam_header_destroy(h);
-    bam_close(fp);
-    return ret;
-}
diff --git a/sam/sample.c b/sam/sample.c
deleted file mode 100644
index 830b9d1..0000000
--- a/sam/sample.c
+++ /dev/null
@@ -1,107 +0,0 @@
-#include <stdlib.h>
-#include <string.h>
-#include "sample.h"
-#include "khash.h"
-KHASH_MAP_INIT_STR(sm, int)
-
-// Allocate an empty sample table with its two hash tables
-// (read group -> sample id, sample name -> sample id).
-// Returns NULL if the structure cannot be allocated; release the result
-// with bam_smpl_destroy().
-bam_sample_t *bam_smpl_init(void)
-{
-    bam_sample_t *s;
-    s = calloc(1, sizeof(bam_sample_t));
-    if (s == 0) return 0;
-    s->rg2smid = kh_init(sm);
-    s->sm2id = kh_init(sm);
-    return s;
-}
-
-// Release a sample table created by bam_smpl_init(); a NULL argument is a
-// no-op. Frees the sample names, the read-group keys of rg2smid (they are
-// strdup()ed copies), both hash tables and the structure itself. The keys of
-// sm2id are the sample-name strings and are therefore freed only once, via
-// smpl[].
-void bam_smpl_destroy(bam_sample_t *sm)
-{
-    int i;
-    khint_t k;
-    khash_t(sm) *rg2smid;
-    // Check for NULL before touching any member.
-    if (sm == 0) return;
-    rg2smid = (khash_t(sm)*)sm->rg2smid;
-    for (i = 0; i < sm->n; ++i) free(sm->smpl[i]);
-    free(sm->smpl);
-    for (k = kh_begin(rg2smid); k != kh_end(rg2smid); ++k)
-        if (kh_exist(rg2smid, k)) free((char*)kh_key(rg2smid, k));
-    kh_destroy(sm, sm->rg2smid);
-    kh_destroy(sm, sm->sm2id);
-    free(sm);
-}
-
-// Record that read-group key `key' belongs to sample `val'.
-// A key that is already registered is left untouched. A sample name seen for
-// the first time is copied, appended to sm->smpl and given the next free id;
-// the key is then mapped to the sample's id. Both strings are copied, so the
-// caller keeps ownership of key and val.
-static void add_pair(bam_sample_t *sm, khash_t(sm) *sm2id, const char *key, const char *val)
-{
-    khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid;
-    khint_t rg_it, sm_it;
-    int absent;
-
-    if (kh_get(sm, rg2smid, key) != kh_end(rg2smid))
-        return; // duplicated @RG-ID
-    rg_it = kh_put(sm, rg2smid, strdup(key), &absent);
-
-    sm_it = kh_get(sm, sm2id, val);
-    if (sm_it == kh_end(sm2id)) { // absent
-        char *name;
-        if (sm->n == sm->m) {
-            // sample array is full: double it (start with one slot)
-            sm->m = sm->m? sm->m<<1 : 1;
-            sm->smpl = realloc(sm->smpl, sizeof(void*) * sm->m);
-        }
-        name = strdup(val);
-        sm->smpl[sm->n] = name;
-        sm_it = kh_put(sm, sm2id, name, &absent);
-        kh_val(sm2id, sm_it) = sm->n++;
-    }
-    kh_val(rg2smid, rg_it) = kh_val(sm2id, sm_it);
-}
-
-int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt) // registers the read-group -> sample pairs found in header text txt for file fn; always returns 0
-{
- const char *p = txt, *q, *r;
- kstring_t buf, first_sm;
- int n = 0;
- khash_t(sm) *sm2id = (khash_t(sm)*)sm->sm2id;
- if (txt == 0) { // no header text: the file name serves as both the key and the sample name
- add_pair(sm, sm2id, fn, fn);
- return 0;
- }
- memset(&buf, 0, sizeof(kstring_t));
- memset(&first_sm, 0, sizeof(kstring_t));
- while ((q = strstr(p, "@RG")) != 0) {
- p = q + 3;
- r = q = 0;
- if ((q = strstr(p, "\tID:")) != 0) q += 4; // NOTE(review): neither search is bounded to the current @RG line
- if ((r = strstr(p, "\tSM:")) != 0) r += 4;
- if (r && q) {
- char *u, *v;
- int oq, or;
- for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u); // u: end of the ID value
- for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v); // v: end of the SM value
- oq = *u; or = *v; *u = *v = '\0'; // temporarily NUL-terminate both values in place (txt is written through despite being const)
- buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf); // the key is "<fn>/<ID>"
- add_pair(sm, sm2id, buf.s, r);
- if ( !first_sm.s )
- kputs(r,&first_sm); // remember the first sample name for the single-@RG case below
- *u = oq; *v = or; // restore the characters overwritten above
- } else break; // stop at the first @RG that lacks an ID or an SM
- p = q > r? q : r; // continue after whichever value lies further along
- ++n;
- }
- if (n == 0) add_pair(sm, sm2id, fn, fn); // no usable @RG: one sample named after the file
- // If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but
- // use the tag instead.
- else if ( n==1 && first_sm.s )
- add_pair(sm,sm2id,fn,first_sm.s);
- if ( first_sm.s )
- free(first_sm.s);
-
-// add_pair(sm, sm2id, fn, fn);
- free(buf.s);
- return 0;
-}
-
-// Map a read group of file fn to its sample id.
-// With a read group the lookup key is "<fn>/<rg>", assembled in the scratch
-// string str (which is overwritten); without one the key is fn itself.
-// Returns the sample id, or -1 if the key is unknown.
-int bam_smpl_rg2smid(const bam_sample_t *sm, const char *fn, const char *rg, kstring_t *str)
-{
-    khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid;
-    const char *lookup = fn;
-    khint_t k;
-
-    if (rg) {
-        str->l = 0;
-        kputs(fn, str); kputc('/', str); kputs(rg, str);
-        lookup = str->s;
-    }
-    k = kh_get(sm, rg2smid, lookup);
-    if (k == kh_end(rg2smid))
-        return -1;
-    return kh_val(rg2smid, k);
-}
diff --git a/sam/sample.h b/sam/sample.h
deleted file mode 100644
index 85fe499..0000000
--- a/sam/sample.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef BAM_SAMPLE_H
-#define BAM_SAMPLE_H
-
-#include "kstring.h"
-
-typedef struct { // sample table: maps read groups to small integer sample ids
- int n, m; // n: number of samples stored in smpl; m: allocated capacity of smpl
- char **smpl; // sample names, indexed by sample id (owned, strdup()ed copies)
- void *rg2smid, *sm2id; // hash tables (khash, kept as void*): read-group key -> sample id, and sample name -> sample id
-} bam_sample_t;
-
-bam_sample_t *bam_smpl_init(void);
-int bam_smpl_add(bam_sample_t *sm, const char *abs, const char *txt);
-int bam_smpl_rg2smid(const bam_sample_t *sm, const char *fn, const char *rg, kstring_t *str);
-void bam_smpl_destroy(bam_sample_t *sm);
-
-#endif
diff --git a/sam/samtools.1 b/sam/samtools.1
deleted file mode 100644
index 5923abd..0000000
--- a/sam/samtools.1
+++ /dev/null
@@ -1,1066 +0,0 @@
-.TH samtools 1 "15 March 2013" "samtools-0.1.19" "Bioinformatics tools"
-.SH NAME
-.PP
-samtools - Utilities for the Sequence Alignment/Map (SAM) format
-
-bcftools - Utilities for the Binary Call Format (BCF) and VCF
-.SH SYNOPSIS
-.PP
-samtools view -bt ref_list.txt -o aln.bam aln.sam.gz
-.PP
-samtools sort aln.bam aln.sorted
-.PP
-samtools index aln.sorted.bam
-.PP
-samtools idxstats aln.sorted.bam
-.PP
-samtools view aln.sorted.bam chr2:20,100,000-20,200,000
-.PP
-samtools merge out.bam in1.bam in2.bam in3.bam
-.PP
-samtools faidx ref.fasta
-.PP
-samtools pileup -vcf ref.fasta aln.sorted.bam
-.PP
-samtools mpileup -C50 -gf ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam
-.PP
-samtools tview aln.sorted.bam ref.fasta
-.PP
-bcftools index in.bcf
-.PP
-bcftools view in.bcf chr2:100-200 > out.vcf
-.PP
-bcftools view -Nvm0.99 in.bcf > out.vcf 2> out.afs
-
-.SH DESCRIPTION
-.PP
-Samtools is a set of utilities that manipulate alignments in the BAM
-format. It imports from and exports to the SAM (Sequence Alignment/Map)
-format, does sorting, merging and indexing, and allows reads in any
-region to be retrieved swiftly.
-
-Samtools is designed to work on a stream. It regards an input file `-'
-as the standard input (stdin) and an output file `-' as the standard
-output (stdout). Several commands can thus be combined with Unix
-pipes. Samtools always outputs warning and error messages to the standard
-error output (stderr).
-
-Samtools is also able to open a BAM (not SAM) file on a remote FTP or
-HTTP server if the BAM file name starts with `ftp://' or `http://'.
-Samtools checks the current working directory for the index file and
-will download the index upon absence. Samtools does not retrieve the
-entire alignment file unless it is asked to do so.
-
-.SH SAMTOOLS COMMANDS AND OPTIONS
-
-.TP 10
-.B view
-samtools view [-bchuHS] [-t in.refList] [-o output] [-f reqFlag] [-F
-skipFlag] [-q minMapQ] [-l library] [-r readGroup] [-R rgFile] <in.bam>|<in.sam> [region1 [...]]
-
-Extract/print all or sub alignments in SAM or BAM format. If no region
-is specified, all the alignments will be printed; otherwise only
-alignments overlapping the specified regions will be output. An
-alignment may be given multiple times if it is overlapping several
-regions. A region can be presented, for example, in the following
-format: `chr2' (the whole chr2), `chr2:1000000' (region starting from
-1,000,000bp) or `chr2:1,000,000-2,000,000' (region between 1,000,000 and
-2,000,000bp including the end points). The coordinate is 1-based.
-
-.B OPTIONS:
-.RS
-.TP 10
-.B -b
-Output in the BAM format.
-.TP
-.BI -f \ INT
-Only output alignments with all bits in INT present in the FLAG
-field. INT can be in hex in the format of /^0x[0-9A-F]+/ [0]
-.TP
-.BI -F \ INT
-Skip alignments with bits present in INT [0]
-.TP
-.B -h
-Include the header in the output.
-.TP
-.B -H
-Output the header only.
-.TP
-.BI -l \ STR
-Only output reads in library STR [null]
-.TP
-.BI -o \ FILE
-Output file [stdout]
-.TP
-.BI -q \ INT
-Skip alignments with MAPQ smaller than INT [0]
-.TP
-.BI -r \ STR
-Only output reads in read group STR [null]
-.TP
-.BI -R \ FILE
-Output reads in read groups listed in
-.I FILE
-[null]
-.TP
-.BI -s \ FLOAT
-Fraction of templates/pairs to subsample; the integer part is treated as the
-seed for the random number generator [-1]
-.TP
-.B -S
-Input is in SAM. If @SQ header lines are absent, the
-.B `-t'
-option is required.
-.TP
-.B -c
-Instead of printing the alignments, only count them and print the
-total number. All filter options, such as
-.B `-f',
-.B `-F'
-and
-.B `-q'
-, are taken into account.
-.TP
-.BI -t \ FILE
-This file is TAB-delimited. Each line must contain the reference name
-and the length of the reference, one line for each distinct reference;
-additional fields are ignored. This file also defines the order of the
-reference sequences in sorting. If you run `samtools faidx <ref.fa>',
-the resultant index file
-.I <ref.fa>.fai
-can be used as this
-.I <in.ref_list>
-file.
-.TP
-.B -u
-Output uncompressed BAM. This option saves time spent on
-compression/decompression and is thus preferred when the output is piped
-to another samtools command.
-.RE
-
-.TP
-.B tview
-samtools tview
-.RB [ \-p
-.IR chr:pos ]
-.RB [ \-s
-.IR STR ]
-.RB [ \-d
-.IR display ]
-.RI <in.sorted.bam>
-.RI [ref.fasta]
-
-Text alignment viewer (based on the ncurses library). In the viewer,
-press `?' for help and press `g' to check the alignment start from a
-region in the format like `chr10:10,000,000' or `=10,000,000' when
-viewing the same reference sequence.
-
-.B Options:
-.RS
-.TP 14
-.BI -d \ display
-Output as (H)tml or (C)urses or (T)ext
-.TP
-.BI -p \ chr:pos
-Go directly to this position
-.TP
-.BI -s \ STR
-Display only reads from this sample or read group
-.RE
-
-.TP
-.B mpileup
-samtools mpileup
-.RB [ \-EBugp ]
-.RB [ \-C
-.IR capQcoef ]
-.RB [ \-r
-.IR reg ]
-.RB [ \-f
-.IR in.fa ]
-.RB [ \-l
-.IR list ]
-.RB [ \-M
-.IR capMapQ ]
-.RB [ \-Q
-.IR minBaseQ ]
-.RB [ \-q
-.IR minMapQ ]
-.I in.bam
-.RI [ in2.bam
-.RI [ ... ]]
-
-Generate BCF or pileup for one or multiple BAM files. Alignment records
-are grouped by sample identifiers in @RG header lines. If sample
-identifiers are absent, each input file is regarded as one sample.
-
-In the pileup format (without
-.B -u
-or
-.BR -g ),
-each
-line represents a genomic position, consisting of chromosome name,
-coordinate, reference base, read bases, read qualities and alignment
-mapping qualities. Information on match, mismatch, indel, strand,
-mapping quality and start and end of a read are all encoded at the read
-base column. At this column, a dot stands for a match to the reference
-base on the forward strand, a comma for a match on the reverse strand,
-a '>' or '<' for a reference skip, `ACGTN' for a mismatch on the forward
-strand and `acgtn' for a mismatch on the reverse strand. A pattern
-`\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this
-reference position and the next reference position. The length of the
-insertion is given by the integer in the pattern, followed by the
-inserted sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+'
-represents a deletion from the reference. The deleted bases will be
-presented as `*' in the following lines. Also at the read base column, a
-symbol `^' marks the start of a read. The ASCII of the character
-following `^' minus 33 gives the mapping quality. A symbol `$' marks the
-end of a read segment.
-
-.B Input Options:
-.RS
-.TP 10
-.B -6
-Assume the quality is in the Illumina 1.3+ encoding.
-.TP
-.B -A
-Do not skip anomalous read pairs in variant calling.
-.TP
-.B -B
-Disable probabilistic realignment for the computation of base alignment
-quality (BAQ). BAQ is the Phred-scaled probability of a read base being
-misaligned. Applying this option greatly helps to reduce false SNPs
-caused by misalignments.
-.TP
-.BI -b \ FILE
-List of input BAM files, one file per line [null]
-.TP
-.BI -C \ INT
-Coefficient for downgrading mapping quality for reads containing
-excessive mismatches. Given a read with a phred-scaled probability q of
-being generated from the mapped position, the new mapping quality is
-about sqrt((INT-q)/INT)*INT. A zero value disables this
-functionality; if enabled, the recommended value for BWA is 50. [0]
-.TP
-.BI -d \ INT
-At a position, read maximally
-.I INT
-reads per input BAM. [250]
-.TP
-.B -E
-Extended BAQ computation. This option helps sensitivity especially for MNPs, but may hurt
-specificity a little bit.
-.TP
-.BI -f \ FILE
-The
-.BR faidx -indexed
-reference file in the FASTA format. The file can be optionally compressed by
-.BR razip .
-[null]
-.TP
-.BI -l \ FILE
-BED or position list file containing a list of regions or sites where pileup or BCF should be generated [null]
-.TP
-.BI -q \ INT
-Minimum mapping quality for an alignment to be used [0]
-.TP
-.BI -Q \ INT
-Minimum base quality for a base to be considered [13]
-.TP
-.BI -r \ STR
-Only generate pileup in region
-.I STR
-[all sites]
-.TP
-.B Output Options:
-
-.TP
-.B -D
-Output per-sample read depth
-.TP
-.B -g
-Compute genotype likelihoods and output them in the binary call format (BCF).
-.TP
-.B -S
-Output per-sample Phred-scaled strand bias P-value
-.TP
-.B -u
-Similar to
-.B -g
-except that the output is uncompressed BCF, which is preferred for piping.
-
-.TP
-.B Options for Genotype Likelihood Computation (for -g or -u):
-
-.TP
-.BI -e \ INT
-Phred-scaled gap extension sequencing error probability. Reducing
-.I INT
-leads to longer indels. [20]
-.TP
-.BI -h \ INT
-Coefficient for modeling homopolymer errors. Given an
-.IR l -long
-homopolymer
-run, the sequencing error of an indel of size
-.I s
-is modeled as
-.IR INT * s / l .
-[100]
-.TP
-.B -I
-Do not perform INDEL calling
-.TP
-.BI -L \ INT
-Skip INDEL calling if the average per-sample depth is above
-.IR INT .
-[250]
-.TP
-.BI -o \ INT
-Phred-scaled gap open sequencing error probability. Reducing
-.I INT
-leads to more indel calls. [40]
-.TP
-.BI -p
-Apply -m and -F thresholds per sample to increase sensitivity of calling.
-By default both options are applied to reads pooled from all samples.
-.TP
-.BI -P \ STR
-Comma delimited list of platforms (determined by
-.BR @RG-PL )
-from which indel candidates are obtained. It is recommended to collect
-indel candidates from sequencing technologies that have low indel error
-rate such as ILLUMINA. [all]
-.RE
-
-.TP
-.B reheader
-samtools reheader <in.header.sam> <in.bam>
-
-Replace the header in
-.I in.bam
-with the header in
-.I in.header.sam.
-This command is much faster than replacing the header with a
-BAM->SAM->BAM conversion.
-
-.TP
-.B cat
-samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [ ... ]
-
-Concatenate BAMs. The sequence dictionary of each input BAM must be identical,
-although this command does not check this. This command uses a similar trick
-to
-.B reheader
-which enables fast BAM concatenation.
-
-.TP
-.B sort
-samtools sort [-nof] [-m maxMem] <in.bam> <out.prefix>
-
-Sort alignments by leftmost coordinates. File
-.I <out.prefix>.bam
-will be created. This command may also create temporary files
-.I <out.prefix>.%d.bam
-when the whole alignment cannot be fitted into memory (controlled by
-option -m).
-
-.B OPTIONS:
-.RS
-.TP 8
-.B -o
-Output the final alignment to the standard output.
-.TP
-.B -n
-Sort by read names rather than by chromosomal coordinates
-.TP
-.B -f
-Use
-.I <out.prefix>
-as the full output path and do not append
-.I .bam
-suffix.
-.TP
-.BI -m \ INT
-Approximately the maximum required memory. [500000000]
-.RE
-
-.TP
-.B merge
-samtools merge [-nur1f] [-h inh.sam] [-R reg] <out.bam> <in1.bam> <in2.bam> [...]
-
-Merge multiple sorted alignments.
-The header reference lists of all the input BAM files, and the @SQ headers of
-.IR inh.sam ,
-if any, must all refer to the same set of reference sequences.
-The header reference list and (unless overridden by
-.BR -h )
-`@' headers of
-.I in1.bam
-will be copied to
-.IR out.bam ,
-and the headers of other files will be ignored.
-
-.B OPTIONS:
-.RS
-.TP 8
-.B -1
-Use zlib compression level 1 to compress the output
-.TP
-.B -f
-Force to overwrite the output file if present.
-.TP 8
-.BI -h \ FILE
-Use the lines of
-.I FILE
-as `@' headers to be copied to
-.IR out.bam ,
-replacing any header lines that would otherwise be copied from
-.IR in1.bam .
-.RI ( FILE
-is actually in SAM format, though any alignment records it may contain
-are ignored.)
-.TP
-.B -n
-The input alignments are sorted by read names rather than by chromosomal
-coordinates
-.TP
-.BI -R \ STR
-Merge files in the specified region indicated by
-.I STR
-[null]
-.TP
-.B -r
-Attach an RG tag to each alignment. The tag value is inferred from file names.
-.TP
-.B -u
-Uncompressed BAM output
-.RE
-
-.TP
-.B index
-samtools index <aln.bam>
-
-Index sorted alignment for fast random access. Index file
-.I <aln.bam>.bai
-will be created.
-
-.TP
-.B idxstats
-samtools idxstats <aln.bam>
-
-Retrieve and print stats in the index file. The output is TAB delimited
-with each line consisting of reference sequence name, sequence length, #
-mapped reads and # unmapped reads.
-
-.TP
-.B faidx
-samtools faidx <ref.fasta> [region1 [...]]
-
-Index reference sequence in the FASTA format or extract subsequence from
-indexed reference sequence. If no region is specified,
-.B faidx
-will index the file and create
-.I <ref.fasta>.fai
-on the disk. If regions are specified, the subsequences will be
-retrieved and printed to stdout in the FASTA format. The input file can
-be compressed in the
-.B RAZF
-format.
-
-.TP
-.B fixmate
-samtools fixmate <in.nameSrt.bam> <out.bam>
-
-Fill in mate coordinates, ISIZE and mate related flags from a
-name-sorted alignment.
-
-.TP
-.B rmdup
-samtools rmdup [-sS] <input.srt.bam> <out.bam>
-
-Remove potential PCR duplicates: if multiple read pairs have identical
-external coordinates, only retain the pair with highest mapping quality.
-In the paired-end mode, this command
-.B ONLY
-works with FR orientation and requires ISIZE is correctly set. It does
-not work for unpaired reads (e.g. two ends mapped to different
-chromosomes or orphan reads).
-
-.B OPTIONS:
-.RS
-.TP 8
-.B -s
-Remove duplicate for single-end reads. By default, the command works for
-paired-end reads only.
-.TP 8
-.B -S
-Treat paired-end reads as single-end reads.
-.RE
-
-.TP
-.B calmd
-samtools calmd [-EeubSr] [-C capQcoef] <aln.bam> <ref.fasta>
-
-Generate the MD tag. If the MD tag is already present, this command will
-give a warning if the MD tag generated is different from the existing
-tag. Output SAM by default.
-
-.B OPTIONS:
-.RS
-.TP 8
-.B -A
-When used jointly with
-.B -r
-this option overwrites the original base quality.
-.TP 8
-.B -e
-Convert the read base to = if it is identical to the aligned reference
-base. Indel caller does not support the = bases at the moment.
-.TP
-.B -u
-Output uncompressed BAM
-.TP
-.B -b
-Output compressed BAM
-.TP
-.B -S
-The input is SAM with header lines
-.TP
-.BI -C \ INT
-Coefficient to cap mapping quality of poorly mapped reads. See the
-.B pileup
-command for details. [0]
-.TP
-.B -r
-Compute the BQ tag (without -A) or cap base quality by BAQ (with -A).
-.TP
-.B -E
-Extended BAQ calculation. This option trades specificity for sensitivity, though the
-effect is minor.
-.RE
-
-.TP
-.B targetcut
-samtools targetcut [-Q minBaseQ] [-i inPenalty] [-0 em0] [-1 em1] [-2 em2] [-f ref] <in.bam>
-
-This command identifies target regions by examining the continuity of read depth, computes
-haploid consensus sequences of targets and outputs a SAM with each sequence corresponding
-to a target. When option
-.B -f
-is in use, BAQ will be applied. This command is
-.B only
-designed for cutting fosmid clones from fosmid pool sequencing [Ref. Kitzman et al. (2010)].
-.RE
-
-.TP
-.B phase
-samtools phase [-AF] [-k len] [-b prefix] [-q minLOD] [-Q minBaseQ] <in.bam>
-
-Call and phase heterozygous SNPs.
-.B OPTIONS:
-.RS
-.TP 8
-.B -A
-Drop reads with ambiguous phase.
-.TP 8
-.BI -b \ STR
-Prefix of BAM output. When this option is in use, phase-0 reads will be saved in file
-.BR STR .0.bam
-and phase-1 reads in
-.BR STR .1.bam.
-Phase unknown reads will be randomly allocated to one of the two files. Chimeric reads
-with switch errors will be saved in
-.BR STR .chimeric.bam.
-[null]
-.TP
-.B -F
-Do not attempt to fix chimeric reads.
-.TP
-.BI -k \ INT
-Maximum length for local phasing. [13]
-.TP
-.BI -q \ INT
-Minimum Phred-scaled LOD to call a heterozygote. [40]
-.TP
-.BI -Q \ INT
-Minimum base quality to be used in het calling. [13]
-.RE
-
-.SH BCFTOOLS COMMANDS AND OPTIONS
-
-.TP 10
-.B view
-.B bcftools view
-.RB [ \-AbFGNQSucgv ]
-.RB [ \-D
-.IR seqDict ]
-.RB [ \-l
-.IR listLoci ]
-.RB [ \-s
-.IR listSample ]
-.RB [ \-i
-.IR gapSNPratio ]
-.RB [ \-t
-.IR mutRate ]
-.RB [ \-p
-.IR varThres ]
-.RB [ \-m
-.IR varThres ]
-.RB [ \-P
-.IR prior ]
-.RB [ \-1
-.IR nGroup1 ]
-.RB [ \-d
-.IR minFrac ]
-.RB [ \-U
-.IR nPerm ]
-.RB [ \-X
-.IR permThres ]
-.RB [ \-T
-.IR trioType ]
-.I in.bcf
-.RI [ region ]
-
-Convert between BCF and VCF, call variant candidates and estimate allele
-frequencies.
-
-.RS
-.TP
-.B Input/Output Options:
-.TP 10
-.B -A
-Retain all possible alternate alleles at variant sites. By default, the view
-command discards unlikely alleles.
-.TP 10
-.B -b
-Output in the BCF format. The default is VCF.
-.TP
-.BI -D \ FILE
-Sequence dictionary (list of chromosome names) for VCF->BCF conversion [null]
-.TP
-.B -F
-Indicate PL is generated by r921 or before (ordering is different).
-.TP
-.B -G
-Suppress all individual genotype information.
-.TP
-.BI -l \ FILE
-List of sites at which information are outputted [all sites]
-.TP
-.B -N
-Skip sites where the REF field is not A/C/G/T
-.TP
-.B -Q
-Output the QCALL likelihood format
-.TP
-.BI -s \ FILE
-List of samples to use. The first column in the input gives the sample names
-and the second gives the ploidy, which can only be 1 or 2. When the 2nd column
-is absent, the sample ploidy is assumed to be 2. In the output, the ordering of
-samples will be identical to the one in
-.IR FILE .
-[null]
-.TP
-.B -S
-The input is VCF instead of BCF.
-.TP
-.B -u
-Uncompressed BCF output (force -b).
-.TP
-.B Consensus/Variant Calling Options:
-.TP 10
-.B -c
-Call variants using Bayesian inference. This option automatically invokes option
-.BR -e .
-.TP
-.BI -d \ FLOAT
-When
-.B -v
-is in use, skip loci where the fraction of samples covered by reads is below FLOAT. [0]
-.TP
-.B -e
-Perform max-likelihood inference only, including estimating the site allele frequency,
-testing Hardy-Weinberg equilibrium and testing associations with LRT.
-.TP
-.B -g
-Call per-sample genotypes at variant sites (force -c)
-.TP
-.BI -i \ FLOAT
-Ratio of INDEL-to-SNP mutation rate [0.15]
-.TP
-.BI -m \ FLOAT
-New model for improved multiallelic and rare-variant calling. Another
-ALT allele is accepted if P(chi^2) of LRT exceeds the FLOAT threshold. The
-parameter seems robust and the actual value usually does not affect the results
-much; a good value to use is 0.99. This is the recommended calling method. [0]
-.TP
-.BI -p \ FLOAT
-A site is considered to be a variant if P(ref|D)<FLOAT [0.5]
-.TP
-.BI -P \ STR
-Prior or initial allele frequency spectrum. STR can be
-.IR full ,
-.IR cond2 ,
-.I flat
-or the file consisting of error output from a previous variant calling
-run.
-.TP
-.BI -t \ FLOAT
-Scaled mutation rate for variant calling [0.001]
-.TP
-.BI -T \ STR
-Enable pair/trio calling. For trio calling, option
-.B -s
-is usually needed to be applied to configure the trio members and their ordering.
-In the file supplied to the option
-.BR -s ,
-the first sample must be the child, the second the father and the third the mother.
-The valid values of
-.I STR
-are `pair', `trioauto', `trioxd' and `trioxs', where `pair' calls differences between two input samples, and `trioxd' (`trioxs') specifies that the input
-is from the X chromosome non-PAR regions and the child is a female (male). [null]
-.TP
-.B -v
-Output variant sites only (force -c)
-.TP
-.B Contrast Calling and Association Test Options:
-.TP
-.BI -1 \ INT
-Number of group-1 samples. This option is used for dividing the samples into
-two groups for contrast SNP calling or association test.
-When this option is in use, the following VCF INFO will be outputted:
-PC2, PCHI2 and QCHI2. [0]
-.TP
-.BI -U \ INT
-Number of permutations for association test (effective only with
-.BR -1 )
-[0]
-.TP
-.BI -X \ FLOAT
-Only perform permutations for P(chi^2)<FLOAT (effective only with
-.BR -U )
-[0.01]
-.RE
-
-.TP
-.B index
-.B bcftools index
-.I in.bcf
-
-Index sorted BCF for random access.
-.RE
-
-.TP
-.B cat
-.B bcftools cat
-.I in1.bcf
-.RI [ "in2.bcf " [ ... "]]]"
-
-Concatenate BCF files. The input files are required to be sorted and
-have identical samples appearing in the same order.
-.RE
-.SH SAM FORMAT
-
-Sequence Alignment/Map (SAM) format is TAB-delimited. Apart from the header lines, which are started
-with the `@' symbol, each alignment line consists of:
-
-.TS
-center box;
-cb | cb | cb
-n | l | l .
-Col Field Description
-_
-1 QNAME Query template/pair NAME
-2 FLAG bitwise FLAG
-3 RNAME Reference sequence NAME
-4 POS 1-based leftmost POSition/coordinate of clipped sequence
-5 MAPQ MAPping Quality (Phred-scaled)
-6 CIGAR extended CIGAR string
-7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME)
-8 MPOS 1-based Mate POSition
-9 TLEN inferred Template LENgth (insert size)
-10 SEQ query SEQuence on the same strand as the reference
-11 QUAL query QUALity (ASCII-33 gives the Phred base quality)
-12+ OPT variable OPTional fields in the format TAG:VTYPE:VALUE
-.TE
-
-.PP
-Each bit in the FLAG field is defined as:
-
-.TS
-center box;
-cb | cb | cb
-l | c | l .
-Flag Chr Description
-_
-0x0001 p the read is paired in sequencing
-0x0002 P the read is mapped in a proper pair
-0x0004 u the query sequence itself is unmapped
-0x0008 U the mate is unmapped
-0x0010 r strand of the query (1 for reverse)
-0x0020 R strand of the mate
-0x0040 1 the read is the first read in a pair
-0x0080 2 the read is the second read in a pair
-0x0100 s the alignment is not primary
-0x0200 f the read fails platform/vendor quality checks
-0x0400 d the read is either a PCR or an optical duplicate
-.TE
-
-where the second column gives the string representation of the FLAG field.
-
-.SH VCF FORMAT
-
-The Variant Call Format (VCF) is a TAB-delimited format with each data line consisting of the following fields:
-.TS
-center box;
-cb | cb | cb
-n | l | l .
-Col Field Description
-_
-1 CHROM CHROMosome name
-2 POS the left-most POSition of the variant
-3 ID unique variant IDentifier
-4 REF the REFerence allele
-5 ALT the ALTernate allele(s), separated by comma
-6 QUAL variant/reference QUALity
-7 FILTER FILTers applied
-8 INFO INFOrmation related to the variant, separated by semi-colon
-9 FORMAT FORMAT of the genotype fields, separated by colon (optional)
-10+ SAMPLE SAMPLE genotypes and per-sample information (optional)
-.TE
-
-.PP
-The following table gives the
-.B INFO
-tags used by samtools and bcftools.
-
-.TS
-center box;
-cb | cb | cb
-l | l | l .
-Tag Format Description
-_
-AF1 double Max-likelihood estimate of the site allele frequency (AF) of the first ALT allele
-DP int Raw read depth (without quality filtering)
-DP4 int[4] # high-quality reference forward bases, ref reverse, alternate for and alt rev bases
-FQ int Consensus quality. Positive: sample genotypes different; negative: otherwise
-MQ int Root-Mean-Square mapping quality of covering reads
-PC2 int[2] Phred probability of AF in group1 samples being larger (,smaller) than in group2
-PCHI2 double Posterior weighted chi^2 P-value between group1 and group2 samples
-PV4 double[4] P-value for strand bias, baseQ bias, mapQ bias and tail distance bias
-QCHI2 int Phred-scaled PCHI2
-RP int # permutations yielding a smaller PCHI2
-CLR int Phred log ratio of genotype likelihoods with and without the trio/pair constraint
-UGT string Most probable genotype configuration without the trio constraint
-CGT string Most probable configuration with the trio constraint
-VDB float Tests variant positions within reads. Intended for filtering RNA-seq artifacts around splice sites
-RPB float Mann-Whitney rank-sum test for tail distance bias
-HWE float Hardy-Weinberg equilibrium test, Wigginton et al., PMID: 15789306
-.TE
-
-.SH EXAMPLES
-.IP o 2
-Import SAM to BAM when
-.B @SQ
-lines are present in the header:
-
- samtools view -bS aln.sam > aln.bam
-
-If
-.B @SQ
-lines are absent:
-
- samtools faidx ref.fa
- samtools view -bt ref.fa.fai aln.sam > aln.bam
-
-where
-.I ref.fa.fai
-is generated automatically by the
-.B faidx
-command.
-
-.IP o 2
-Attach the
-.B RG
-tag while merging sorted alignments:
-
- perl -e 'print "@RG\\tID:ga\\tSM:hs\\tLB:ga\\tPL:Illumina\\n@RG\\tID:454\\tSM:hs\\tLB:454\\tPL:454\\n"' > rg.txt
- samtools merge -rh rg.txt merged.bam ga.bam 454.bam
-
-The value in a
-.B RG
-tag is determined by the file name the read is coming from. In this
-example, in the
-.IR merged.bam ,
-reads from
-.I ga.bam
-will be attached
-.IR RG:Z:ga ,
-while reads from
-.I 454.bam
-will be attached
-.IR RG:Z:454 .
-
-.IP o 2
-Call SNPs and short INDELs for one diploid individual:
-
- samtools mpileup -ugf ref.fa aln.bam | bcftools view -bvcg - > var.raw.bcf
- bcftools view var.raw.bcf | vcfutils.pl varFilter -D 100 > var.flt.vcf
-
-The
-.B -D
-option of varFilter controls the maximum read depth, which should be
-adjusted to about twice the average read depth. One may consider to add
-.B -C50
-to
-.B mpileup
-if mapping quality is overestimated for reads containing excessive
-mismatches. Applying this option usually helps
-.B BWA-short
-but may not help other mappers.
-
-.IP o 2
-Generate the consensus sequence for one diploid individual:
-
- samtools mpileup -uf ref.fa aln.bam | bcftools view -cg - | vcfutils.pl vcf2fq > cns.fq
-
-.IP o 2
-Call somatic mutations from a pair of samples:
-
- samtools mpileup -DSuf ref.fa aln.bam | bcftools view -bvcgT pair - > var.bcf
-
-In the output INFO field,
-.I CLR
-gives the Phred-log ratio between the likelihood by treating the
-two samples independently, and the likelihood by requiring the genotype to be identical.
-This
-.I CLR
-is effectively a score measuring the confidence of somatic calls. The higher the better.
-
-.IP o 2
-Call de novo and somatic mutations from a family trio:
-
- samtools mpileup -DSuf ref.fa aln.bam | bcftools view -bvcgT trioauto -s samples.txt - > var.bcf
-
-File
-.I samples.txt
-should consist of three lines specifying the member and order of samples (in the order of child-father-mother).
-Similarly,
-.I CLR
-gives the Phred-log likelihood ratio with and without the trio constraint.
-.I UGT
-shows the most likely genotype configuration without the trio constraint, and
-.I CGT
-gives the most likely genotype configuration satisfying the trio constraint.
-
-.IP o 2
-Phase one individual:
-
- samtools calmd -AEur aln.bam ref.fa | samtools phase -b prefix - > phase.out
-
-The
-.B calmd
-command is used to reduce false heterozygotes around INDELs.
-
-.IP o 2
-Call SNPs and short indels for multiple diploid individuals:
-
- samtools mpileup -P ILLUMINA -ugf ref.fa *.bam | bcftools view -bcvg - > var.raw.bcf
- bcftools view var.raw.bcf | vcfutils.pl varFilter -D 2000 > var.flt.vcf
-
-Individuals are identified from the
-.B SM
-tags in the
-.B @RG
-header lines. Individuals can be pooled in one alignment file; one
-individual can also be separated into multiple files. The
-.B -P
-option specifies that indel candidates should be collected only from
-read groups with the
-.B @RG-PL
-tag set to
-.IR ILLUMINA .
-Collecting indel candidates from reads sequenced by an indel-prone
-technology may affect the performance of indel calling.
-
-Note that there is a new calling model which can be invoked by
-
- bcftools view -m0.99 ...
-
-which fixes some severe limitations of the default method.
-
-For filtering, best results seem to be achieved by first applying the
-.IR SnpGap
-filter and then applying some machine learning approach
-
- vcf-annotate -f SnpGap=n
- vcf filter ...
-
-Both can be found in the
-.B vcftools
-and
-.B htslib
-package (links below).
-
-.IP o 2
-Derive the allele frequency spectrum (AFS) on a list of sites from multiple individuals:
-
- samtools mpileup -Igf ref.fa *.bam > all.bcf
- bcftools view -bl sites.list all.bcf > sites.bcf
- bcftools view -cGP cond2 sites.bcf > /dev/null 2> sites.1.afs
- bcftools view -cGP sites.1.afs sites.bcf > /dev/null 2> sites.2.afs
- bcftools view -cGP sites.2.afs sites.bcf > /dev/null 2> sites.3.afs
- ......
-
-where
-.I sites.list
-contains the list of sites with each line consisting of the reference
-sequence name and position. The following
-.B bcftools
-commands estimate AFS by EM.
-
-.IP o 2
-Dump BAQ applied alignment for other SNP callers:
-
- samtools calmd -bAr aln.bam > aln.baq.bam
-
-It adds and corrects the
-.B NM
-and
-.B MD
-tags at the same time. The
-.B calmd
-command also comes with the
-.B -C
-option, the same as the one in
-.B pileup
-and
-.BR mpileup .
-Apply if it helps.
-
-.SH LIMITATIONS
-.PP
-.IP o 2
-Unaligned words used in bam_import.c, bam_endian.h, bam.c and bam_aux.c.
-.IP o 2
-Samtools paired-end rmdup does not work for unpaired reads (e.g. orphan
-reads or ends mapped to different chromosomes). If this is a concern,
-please use Picard's MarkDuplicate which correctly handles these cases,
-although a little slower.
-
-.SH AUTHOR
-.PP
-Heng Li from the Sanger Institute wrote the C version of samtools. Bob
-Handsaker from the Broad Institute implemented the BGZF library and Jue
-Ruan from Beijing Genomics Institute wrote the RAZF library. John
-Marshall and Petr Danecek contribute to the source code and various
-people from the 1000 Genomes Project have contributed to the SAM format
-specification.
-
-.SH SEE ALSO
-.PP
-Samtools website: <http://samtools.sourceforge.net>
-.br
-Samtools latest source: <https://github.com/samtools/samtools>
-.br
-VCFtools website with stable link to VCF specification: <http://vcftools.sourceforge.net>
-.br
-HTSlib website: <https://github.com/samtools/htslib>
diff --git a/sam/win32/._xcurses.h b/sam/win32/._xcurses.h
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/win32/._xcurses.h and /dev/null differ
diff --git a/sam/win32/._zconf.h b/sam/win32/._zconf.h
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/win32/._zconf.h and /dev/null differ
diff --git a/sam/win32/._zlib.h b/sam/win32/._zlib.h
deleted file mode 100644
index 94286bb..0000000
Binary files a/sam/win32/._zlib.h and /dev/null differ
diff --git a/sam/win32/xcurses.h b/sam/win32/xcurses.h
deleted file mode 100644
index 6f3ce19..0000000
--- a/sam/win32/xcurses.h
+++ /dev/null
@@ -1,1377 +0,0 @@
-/* Public Domain Curses */
-
-/* $Id: curses.h,v 1.295 2008/07/15 17:13:25 wmcbrine Exp $ */
-
-/*----------------------------------------------------------------------*
- * PDCurses *
- *----------------------------------------------------------------------*/
-
-#ifndef __PDCURSES__
-#define __PDCURSES__ 1
-
-/*man-start**************************************************************
-
-PDCurses definitions list: (Only define those needed)
-
- XCURSES True if compiling for X11.
- PDC_RGB True if you want to use RGB color definitions
- (Red = 1, Green = 2, Blue = 4) instead of BGR.
- PDC_WIDE True if building wide-character support.
- PDC_DLL_BUILD True if building a Win32 DLL.
- NCURSES_MOUSE_VERSION Use the ncurses mouse API instead
- of PDCurses' traditional mouse API.
-
-PDCurses portable platform definitions list:
-
- PDC_BUILD Defines API build version.
- PDCURSES Enables access to PDCurses-only routines.
- XOPEN Always true.
- SYSVcurses True if you are compiling for SYSV portability.
- BSDcurses True if you are compiling for BSD portability.
-
-**man-end****************************************************************/
-
-#define PDC_BUILD 3401
-#define PDCURSES 1 /* PDCurses-only routines */
-#define XOPEN 1 /* X/Open Curses routines */
-#define SYSVcurses 1 /* System V Curses routines */
-#define BSDcurses 1 /* BSD Curses routines */
-#define CHTYPE_LONG 1 /* size of chtype; long */
-
-/*----------------------------------------------------------------------*/
-
-#include <stdarg.h>
-#include <stddef.h>
-#include <stdio.h> /* Required by X/Open usage below */
-
-#ifdef PDC_WIDE
-# include <wchar.h>
-#endif
-
-#if defined(__cplusplus) || defined(__cplusplus__) || defined(__CPLUSPLUS)
-extern "C"
-{
-# define bool _bool
-#endif
-
-/*----------------------------------------------------------------------
- *
- * PDCurses Manifest Constants
- *
- */
-
-#ifndef FALSE
-# define FALSE 0
-#endif
-#ifndef TRUE
-# define TRUE 1
-#endif
-#ifndef NULL
-# define NULL (void *)0
-#endif
-#ifndef ERR
-# define ERR (-1)
-#endif
-#ifndef OK
-# define OK 0
-#endif
-
-/*----------------------------------------------------------------------
- *
- * PDCurses Type Declarations
- *
- */
-
-typedef unsigned char bool; /* PDCurses Boolean type */
-
-#ifdef CHTYPE_LONG
-# if _LP64
-typedef unsigned int chtype;
-# else
-typedef unsigned long chtype; /* 16-bit attr + 16-bit char */
-# endif
-#else
-typedef unsigned short chtype; /* 8-bit attr + 8-bit char */
-#endif
-
-#ifdef PDC_WIDE
-typedef chtype cchar_t;
-#endif
-
-typedef chtype attr_t;
-
-/*----------------------------------------------------------------------
- *
- * PDCurses Mouse Interface -- SYSVR4, with extensions
- *
- */
-
-typedef struct
-{
- int x; /* absolute column, 0 based, measured in characters */
- int y; /* absolute row, 0 based, measured in characters */
- short button[3]; /* state of each button */
- int changes; /* flags indicating what has changed with the mouse */
-} MOUSE_STATUS;
-
-#define BUTTON_RELEASED 0x0000
-#define BUTTON_PRESSED 0x0001
-#define BUTTON_CLICKED 0x0002
-#define BUTTON_DOUBLE_CLICKED 0x0003
-#define BUTTON_TRIPLE_CLICKED 0x0004
-#define BUTTON_MOVED 0x0005 /* PDCurses */
-#define WHEEL_SCROLLED 0x0006 /* PDCurses */
-#define BUTTON_ACTION_MASK 0x0007 /* PDCurses */
-
-#define PDC_BUTTON_SHIFT 0x0008 /* PDCurses */
-#define PDC_BUTTON_CONTROL 0x0010 /* PDCurses */
-#define PDC_BUTTON_ALT 0x0020 /* PDCurses */
-#define BUTTON_MODIFIER_MASK 0x0038 /* PDCurses */
-
-#define MOUSE_X_POS (Mouse_status.x)
-#define MOUSE_Y_POS (Mouse_status.y)
-
-/*
- * Bits associated with the .changes field:
- * 3 2 1 0
- * 210987654321098765432109876543210
- * 1 <- button 1 has changed
- * 10 <- button 2 has changed
- * 100 <- button 3 has changed
- * 1000 <- mouse has moved
- * 10000 <- mouse position report
- * 100000 <- mouse wheel up
- * 1000000 <- mouse wheel down
- */
-
-#define PDC_MOUSE_MOVED 0x0008
-#define PDC_MOUSE_POSITION 0x0010
-#define PDC_MOUSE_WHEEL_UP 0x0020
-#define PDC_MOUSE_WHEEL_DOWN 0x0040
-
-#define A_BUTTON_CHANGED (Mouse_status.changes & 7)
-#define MOUSE_MOVED (Mouse_status.changes & PDC_MOUSE_MOVED)
-#define MOUSE_POS_REPORT (Mouse_status.changes & PDC_MOUSE_POSITION)
-#define BUTTON_CHANGED(x) (Mouse_status.changes & (1 << ((x) - 1)))
-#define BUTTON_STATUS(x) (Mouse_status.button[(x) - 1])
-#define MOUSE_WHEEL_UP (Mouse_status.changes & PDC_MOUSE_WHEEL_UP)
-#define MOUSE_WHEEL_DOWN (Mouse_status.changes & PDC_MOUSE_WHEEL_DOWN)
-
-/* mouse bit-masks */
-
-#define BUTTON1_RELEASED 0x00000001L
-#define BUTTON1_PRESSED 0x00000002L
-#define BUTTON1_CLICKED 0x00000004L
-#define BUTTON1_DOUBLE_CLICKED 0x00000008L
-#define BUTTON1_TRIPLE_CLICKED 0x00000010L
-#define BUTTON1_MOVED 0x00000010L /* PDCurses */
-
-#define BUTTON2_RELEASED 0x00000020L
-#define BUTTON2_PRESSED 0x00000040L
-#define BUTTON2_CLICKED 0x00000080L
-#define BUTTON2_DOUBLE_CLICKED 0x00000100L
-#define BUTTON2_TRIPLE_CLICKED 0x00000200L
-#define BUTTON2_MOVED 0x00000200L /* PDCurses */
-
-#define BUTTON3_RELEASED 0x00000400L
-#define BUTTON3_PRESSED 0x00000800L
-#define BUTTON3_CLICKED 0x00001000L
-#define BUTTON3_DOUBLE_CLICKED 0x00002000L
-#define BUTTON3_TRIPLE_CLICKED 0x00004000L
-#define BUTTON3_MOVED 0x00004000L /* PDCurses */
-
-/* For the ncurses-compatible functions only, BUTTON4_PRESSED and
- BUTTON5_PRESSED are returned for mouse scroll wheel up and down;
- otherwise PDCurses doesn't support buttons 4 and 5 */
-
-#define BUTTON4_RELEASED 0x00008000L
-#define BUTTON4_PRESSED 0x00010000L
-#define BUTTON4_CLICKED 0x00020000L
-#define BUTTON4_DOUBLE_CLICKED 0x00040000L
-#define BUTTON4_TRIPLE_CLICKED 0x00080000L
-
-#define BUTTON5_RELEASED 0x00100000L
-#define BUTTON5_PRESSED 0x00200000L
-#define BUTTON5_CLICKED 0x00400000L
-#define BUTTON5_DOUBLE_CLICKED 0x00800000L
-#define BUTTON5_TRIPLE_CLICKED 0x01000000L
-
-#define MOUSE_WHEEL_SCROLL 0x02000000L /* PDCurses */
-#define BUTTON_MODIFIER_SHIFT 0x04000000L /* PDCurses */
-#define BUTTON_MODIFIER_CONTROL 0x08000000L /* PDCurses */
-#define BUTTON_MODIFIER_ALT 0x10000000L /* PDCurses */
-
-#define ALL_MOUSE_EVENTS 0x1fffffffL
-#define REPORT_MOUSE_POSITION 0x20000000L
-
-/* ncurses mouse interface */
-
-typedef unsigned long mmask_t;
-
-typedef struct
-{
- short id; /* unused, always 0 */
- int x, y, z; /* x, y same as MOUSE_STATUS; z unused */
- mmask_t bstate; /* equivalent to changes + button[], but
- in the same format as used for mousemask() */
-} MEVENT;
-
-#ifdef NCURSES_MOUSE_VERSION
-# define BUTTON_SHIFT BUTTON_MODIFIER_SHIFT
-# define BUTTON_CONTROL BUTTON_MODIFIER_CONTROL
-# define BUTTON_CTRL BUTTON_MODIFIER_CONTROL
-# define BUTTON_ALT BUTTON_MODIFIER_ALT
-#else
-# define BUTTON_SHIFT PDC_BUTTON_SHIFT
-# define BUTTON_CONTROL PDC_BUTTON_CONTROL
-# define BUTTON_ALT PDC_BUTTON_ALT
-#endif
-
-/*----------------------------------------------------------------------
- *
- * PDCurses Structure Definitions
- *
- */
-
-typedef struct _win /* definition of a window */
-{
- int _cury; /* current pseudo-cursor */
- int _curx;
- int _maxy; /* max window coordinates */
- int _maxx;
- int _begy; /* origin on screen */
- int _begx;
- int _flags; /* window properties */
- chtype _attrs; /* standard attributes and colors */
- chtype _bkgd; /* background, normally blank */
- bool _clear; /* causes clear at next refresh */
- bool _leaveit; /* leaves cursor where it is */
- bool _scroll; /* allows window scrolling */
- bool _nodelay; /* input character wait flag */
- bool _immed; /* immediate update flag */
- bool _sync; /* synchronise window ancestors */
- bool _use_keypad; /* flags keypad key mode active */
- chtype **_y; /* pointer to line pointer array */
- int *_firstch; /* first changed character in line */
- int *_lastch; /* last changed character in line */
- int _tmarg; /* top of scrolling region */
- int _bmarg; /* bottom of scrolling region */
- int _delayms; /* milliseconds of delay for getch() */
- int _parx, _pary; /* coords relative to parent (0,0) */
- struct _win *_parent; /* subwin's pointer to parent win */
-} WINDOW;
-
-/* Avoid using the SCREEN struct directly -- use the corresponding
- functions if possible. This struct may eventually be made private. */
-
-typedef struct
-{
- bool alive; /* if initscr() called, and not endwin() */
- bool autocr; /* if cr -> lf */
- bool cbreak; /* if terminal unbuffered */
- bool echo; /* if terminal echo */
- bool raw_inp; /* raw input mode (v. cooked input) */
- bool raw_out; /* raw output mode (7 v. 8 bits) */
- bool audible; /* FALSE if the bell is visual */
- bool mono; /* TRUE if current screen is mono */
- bool resized; /* TRUE if TERM has been resized */
- bool orig_attr; /* TRUE if we have the original colors */
- short orig_fore; /* original screen foreground color */
- short orig_back; /* original screen background color */
- int cursrow; /* position of physical cursor */
- int curscol; /* position of physical cursor */
- int visibility; /* visibility of cursor */
- int orig_cursor; /* original cursor size */
- int lines; /* new value for LINES */
- int cols; /* new value for COLS */
- unsigned long _trap_mbe; /* trap these mouse button events */
- unsigned long _map_mbe_to_key; /* map mouse buttons to slk */
- int mouse_wait; /* time to wait (in ms) for a
- button release after a press, in
- order to count it as a click */
- int slklines; /* lines in use by slk_init() */
- WINDOW *slk_winptr; /* window for slk */
- int linesrippedoff; /* lines ripped off via ripoffline() */
- int linesrippedoffontop; /* lines ripped off on
- top via ripoffline() */
- int delaytenths; /* 1/10ths second to wait block
- getch() for */
- bool _preserve; /* TRUE if screen background
- to be preserved */
- int _restore; /* specifies if screen background
- to be restored, and how */
- bool save_key_modifiers; /* TRUE if each key modifiers saved
- with each key press */
- bool return_key_modifiers; /* TRUE if modifier keys are
- returned as "real" keys */
- bool key_code; /* TRUE if last key is a special key;
- used internally by get_wch() */
-#ifdef XCURSES
- int XcurscrSize; /* size of Xcurscr shared memory block */
- bool sb_on;
- int sb_viewport_y;
- int sb_viewport_x;
- int sb_total_y;
- int sb_total_x;
- int sb_cur_y;
- int sb_cur_x;
-#endif
- short line_color; /* color of line attributes - default -1 */
-} SCREEN;
-
-/*----------------------------------------------------------------------
- *
- * PDCurses External Variables
- *
- */
-
-#ifdef PDC_DLL_BUILD
-# ifdef CURSES_LIBRARY
-# define PDCEX __declspec(dllexport) extern
-# else
-# define PDCEX __declspec(dllimport)
-# endif
-#else
-# define PDCEX extern
-#endif
-
-PDCEX int LINES; /* terminal height */
-PDCEX int COLS; /* terminal width */
-PDCEX WINDOW *stdscr; /* the default screen window */
-PDCEX WINDOW *curscr; /* the current screen image */
-PDCEX SCREEN *SP; /* curses variables */
-PDCEX MOUSE_STATUS Mouse_status;
-PDCEX int COLORS;
-PDCEX int COLOR_PAIRS;
-PDCEX int TABSIZE;
-PDCEX chtype acs_map[]; /* alternate character set map */
-PDCEX char ttytype[]; /* terminal name/description */
-
-/*man-start**************************************************************
-
-PDCurses Text Attributes
-========================
-
-Originally, PDCurses used a short (16 bits) for its chtype. To include
-color, a number of things had to be sacrificed from the strict Unix and
-System V support. The main problem was fitting all character attributes
-and color into an unsigned char (all 8 bits!).
-
-Today, PDCurses by default uses a long (32 bits) for its chtype, as in
-System V. The short chtype is still available, by undefining CHTYPE_LONG
-and rebuilding the library.
-
-The following is the structure of a win->_attrs chtype:
-
-short form:
-
--------------------------------------------------
-|15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
--------------------------------------------------
- color number | attrs | character eg 'a'
-
-The available non-color attributes are bold, reverse and blink. Others
-have no effect. The high order char is an index into an array of
-physical colors (defined in color.c) -- 32 foreground/background color
-pairs (5 bits) plus 3 bits for other attributes.
-
-long form:
-
-----------------------------------------------------------------------------
-|31|30|29|28|27|26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|..| 3| 2| 1| 0|
-----------------------------------------------------------------------------
- color number | modifiers | character eg 'a'
-
-The available non-color attributes are bold, underline, invisible,
-right-line, left-line, protect, reverse and blink. 256 color pairs (8
-bits), 8 bits for other attributes, and 16 bits for character data.
-
-**man-end****************************************************************/
-
-/*** Video attribute macros ***/
-
-#define A_NORMAL (chtype)0
-
-#ifdef CHTYPE_LONG
-# define A_ALTCHARSET (chtype)0x00010000
-# define A_RIGHTLINE (chtype)0x00020000
-# define A_LEFTLINE (chtype)0x00040000
-# define A_INVIS (chtype)0x00080000
-# define A_UNDERLINE (chtype)0x00100000
-# define A_REVERSE (chtype)0x00200000
-# define A_BLINK (chtype)0x00400000
-# define A_BOLD (chtype)0x00800000
-
-# define A_ATTRIBUTES (chtype)0xffff0000
-# define A_CHARTEXT (chtype)0x0000ffff
-# define A_COLOR (chtype)0xff000000
-
-# define A_ITALIC A_INVIS
-# define A_PROTECT (A_UNDERLINE | A_LEFTLINE | A_RIGHTLINE)
-
-# define PDC_ATTR_SHIFT 19
-# define PDC_COLOR_SHIFT 24
-#else
-# define A_BOLD (chtype)0x0100 /* X/Open */
-# define A_REVERSE (chtype)0x0200 /* X/Open */
-# define A_BLINK (chtype)0x0400 /* X/Open */
-
-# define A_ATTRIBUTES (chtype)0xff00 /* X/Open */
-# define A_CHARTEXT (chtype)0x00ff /* X/Open */
-# define A_COLOR (chtype)0xf800 /* System V */
-
-# define A_ALTCHARSET A_NORMAL /* X/Open */
-# define A_PROTECT A_NORMAL /* X/Open */
-# define A_UNDERLINE A_NORMAL /* X/Open */
-
-# define A_LEFTLINE A_NORMAL
-# define A_RIGHTLINE A_NORMAL
-# define A_ITALIC A_NORMAL
-# define A_INVIS A_NORMAL
-
-# define PDC_ATTR_SHIFT 8
-# define PDC_COLOR_SHIFT 11
-#endif
-
-#define A_STANDOUT (A_REVERSE | A_BOLD) /* X/Open */
-#define A_DIM A_NORMAL
-
-#define CHR_MSK A_CHARTEXT /* Obsolete */
-#define ATR_MSK A_ATTRIBUTES /* Obsolete */
-#define ATR_NRM A_NORMAL /* Obsolete */
-
-/* For use with attr_t -- X/Open says, "these shall be distinct", so
- this is a non-conforming implementation. */
-
-#define WA_ALTCHARSET A_ALTCHARSET
-#define WA_BLINK A_BLINK
-#define WA_BOLD A_BOLD
-#define WA_DIM A_DIM
-#define WA_INVIS A_INVIS
-#define WA_LEFT A_LEFTLINE
-#define WA_PROTECT A_PROTECT
-#define WA_REVERSE A_REVERSE
-#define WA_RIGHT A_RIGHTLINE
-#define WA_STANDOUT A_STANDOUT
-#define WA_UNDERLINE A_UNDERLINE
-
-#define WA_HORIZONTAL A_NORMAL
-#define WA_LOW A_NORMAL
-#define WA_TOP A_NORMAL
-#define WA_VERTICAL A_NORMAL
-
-/*** Alternate character set macros ***/
-
-/* 'w' = 32-bit chtype; acs_map[] index | A_ALTCHARSET
- 'n' = 16-bit chtype; it gets the fallback set because no bit is
- available for A_ALTCHARSET */
-
-#ifdef CHTYPE_LONG
-# define ACS_PICK(w, n) ((chtype)w | A_ALTCHARSET)
-#else
-# define ACS_PICK(w, n) ((chtype)n)
-#endif
-
-/* VT100-compatible symbols -- box chars */
-
-#define ACS_ULCORNER ACS_PICK('l', '+')
-#define ACS_LLCORNER ACS_PICK('m', '+')
-#define ACS_URCORNER ACS_PICK('k', '+')
-#define ACS_LRCORNER ACS_PICK('j', '+')
-#define ACS_RTEE ACS_PICK('u', '+')
-#define ACS_LTEE ACS_PICK('t', '+')
-#define ACS_BTEE ACS_PICK('v', '+')
-#define ACS_TTEE ACS_PICK('w', '+')
-#define ACS_HLINE ACS_PICK('q', '-')
-#define ACS_VLINE ACS_PICK('x', '|')
-#define ACS_PLUS ACS_PICK('n', '+')
-
-/* VT100-compatible symbols -- other */
-
-#define ACS_S1 ACS_PICK('o', '-')
-#define ACS_S9 ACS_PICK('s', '_')
-#define ACS_DIAMOND ACS_PICK('`', '+')
-#define ACS_CKBOARD ACS_PICK('a', ':')
-#define ACS_DEGREE ACS_PICK('f', '\'')
-#define ACS_PLMINUS ACS_PICK('g', '#')
-#define ACS_BULLET ACS_PICK('~', 'o')
-
-/* Teletype 5410v1 symbols -- these are defined in SysV curses, but
- are not well-supported by most terminals. Stick to VT100 characters
- for optimum portability. */
-
-#define ACS_LARROW ACS_PICK(',', '<')
-#define ACS_RARROW ACS_PICK('+', '>')
-#define ACS_DARROW ACS_PICK('.', 'v')
-#define ACS_UARROW ACS_PICK('-', '^')
-#define ACS_BOARD ACS_PICK('h', '#')
-#define ACS_LANTERN ACS_PICK('i', '*')
-#define ACS_BLOCK ACS_PICK('0', '#')
-
-/* That goes double for these -- undocumented SysV symbols. Don't use
- them. */
-
-#define ACS_S3 ACS_PICK('p', '-')
-#define ACS_S7 ACS_PICK('r', '-')
-#define ACS_LEQUAL ACS_PICK('y', '<')
-#define ACS_GEQUAL ACS_PICK('z', '>')
-#define ACS_PI ACS_PICK('{', 'n')
-#define ACS_NEQUAL ACS_PICK('|', '+')
-#define ACS_STERLING ACS_PICK('}', 'L')
-
-/* Box char aliases */
-
-#define ACS_BSSB ACS_ULCORNER
-#define ACS_SSBB ACS_LLCORNER
-#define ACS_BBSS ACS_URCORNER
-#define ACS_SBBS ACS_LRCORNER
-#define ACS_SBSS ACS_RTEE
-#define ACS_SSSB ACS_LTEE
-#define ACS_SSBS ACS_BTEE
-#define ACS_BSSS ACS_TTEE
-#define ACS_BSBS ACS_HLINE
-#define ACS_SBSB ACS_VLINE
-#define ACS_SSSS ACS_PLUS
-
-/* cchar_t aliases */
-
-#ifdef PDC_WIDE
-# define WACS_ULCORNER (&(acs_map['l']))
-# define WACS_LLCORNER (&(acs_map['m']))
-# define WACS_URCORNER (&(acs_map['k']))
-# define WACS_LRCORNER (&(acs_map['j']))
-# define WACS_RTEE (&(acs_map['u']))
-# define WACS_LTEE (&(acs_map['t']))
-# define WACS_BTEE (&(acs_map['v']))
-# define WACS_TTEE (&(acs_map['w']))
-# define WACS_HLINE (&(acs_map['q']))
-# define WACS_VLINE (&(acs_map['x']))
-# define WACS_PLUS (&(acs_map['n']))
-
-# define WACS_S1 (&(acs_map['o']))
-# define WACS_S9 (&(acs_map['s']))
-# define WACS_DIAMOND (&(acs_map['`']))
-# define WACS_CKBOARD (&(acs_map['a']))
-# define WACS_DEGREE (&(acs_map['f']))
-# define WACS_PLMINUS (&(acs_map['g']))
-# define WACS_BULLET (&(acs_map['~']))
-
-# define WACS_LARROW (&(acs_map[',']))
-# define WACS_RARROW (&(acs_map['+']))
-# define WACS_DARROW (&(acs_map['.']))
-# define WACS_UARROW (&(acs_map['-']))
-# define WACS_BOARD (&(acs_map['h']))
-# define WACS_LANTERN (&(acs_map['i']))
-# define WACS_BLOCK (&(acs_map['0']))
-
-# define WACS_S3 (&(acs_map['p']))
-# define WACS_S7 (&(acs_map['r']))
-# define WACS_LEQUAL (&(acs_map['y']))
-# define WACS_GEQUAL (&(acs_map['z']))
-# define WACS_PI (&(acs_map['{']))
-# define WACS_NEQUAL (&(acs_map['|']))
-# define WACS_STERLING (&(acs_map['}']))
-
-# define WACS_BSSB WACS_ULCORNER
-# define WACS_SSBB WACS_LLCORNER
-# define WACS_BBSS WACS_URCORNER
-# define WACS_SBBS WACS_LRCORNER
-# define WACS_SBSS WACS_RTEE
-# define WACS_SSSB WACS_LTEE
-# define WACS_SSBS WACS_BTEE
-# define WACS_BSSS WACS_TTEE
-# define WACS_BSBS WACS_HLINE
-# define WACS_SBSB WACS_VLINE
-# define WACS_SSSS WACS_PLUS
-#endif
-
-/*** Color macros ***/
-
-#define COLOR_BLACK 0
-
-#ifdef PDC_RGB /* RGB */
-# define COLOR_RED 1
-# define COLOR_GREEN 2
-# define COLOR_BLUE 4
-#else /* BGR */
-# define COLOR_BLUE 1
-# define COLOR_GREEN 2
-# define COLOR_RED 4
-#endif
-
-#define COLOR_CYAN (COLOR_BLUE | COLOR_GREEN)
-#define COLOR_MAGENTA (COLOR_RED | COLOR_BLUE)
-#define COLOR_YELLOW (COLOR_RED | COLOR_GREEN)
-
-#define COLOR_WHITE 7
-
-/*----------------------------------------------------------------------
- *
- * Function and Keypad Key Definitions.
- * Many are just for compatibility.
- *
- */
-
-#define KEY_CODE_YES 0x100 /* If get_wch() gives a key code */
-
-#define KEY_BREAK 0x101 /* Not on PC KBD */
-#define KEY_DOWN 0x102 /* Down arrow key */
-#define KEY_UP 0x103 /* Up arrow key */
-#define KEY_LEFT 0x104 /* Left arrow key */
-#define KEY_RIGHT 0x105 /* Right arrow key */
-#define KEY_HOME 0x106 /* home key */
-#define KEY_BACKSPACE 0x107 /* not on pc */
-#define KEY_F0 0x108 /* function keys; 64 reserved */
-
-#define KEY_DL 0x148 /* delete line */
-#define KEY_IL 0x149 /* insert line */
-#define KEY_DC 0x14a /* delete character */
-#define KEY_IC 0x14b /* insert char or enter ins mode */
-#define KEY_EIC 0x14c /* exit insert char mode */
-#define KEY_CLEAR 0x14d /* clear screen */
-#define KEY_EOS 0x14e /* clear to end of screen */
-#define KEY_EOL 0x14f /* clear to end of line */
-#define KEY_SF 0x150 /* scroll 1 line forward */
-#define KEY_SR 0x151 /* scroll 1 line back (reverse) */
-#define KEY_NPAGE 0x152 /* next page */
-#define KEY_PPAGE 0x153 /* previous page */
-#define KEY_STAB 0x154 /* set tab */
-#define KEY_CTAB 0x155 /* clear tab */
-#define KEY_CATAB 0x156 /* clear all tabs */
-#define KEY_ENTER 0x157 /* enter or send (unreliable) */
-#define KEY_SRESET 0x158 /* soft/reset (partial/unreliable) */
-#define KEY_RESET 0x159 /* reset/hard reset (unreliable) */
-#define KEY_PRINT 0x15a /* print/copy */
-#define KEY_LL 0x15b /* home down/bottom (lower left) */
-#define KEY_ABORT 0x15c /* abort/terminate key (any) */
-#define KEY_SHELP 0x15d /* short help */
-#define KEY_LHELP 0x15e /* long help */
-#define KEY_BTAB 0x15f /* Back tab key */
-#define KEY_BEG 0x160 /* beg(inning) key */
-#define KEY_CANCEL 0x161 /* cancel key */
-#define KEY_CLOSE 0x162 /* close key */
-#define KEY_COMMAND 0x163 /* cmd (command) key */
-#define KEY_COPY 0x164 /* copy key */
-#define KEY_CREATE 0x165 /* create key */
-#define KEY_END 0x166 /* end key */
-#define KEY_EXIT 0x167 /* exit key */
-#define KEY_FIND 0x168 /* find key */
-#define KEY_HELP 0x169 /* help key */
-#define KEY_MARK 0x16a /* mark key */
-#define KEY_MESSAGE 0x16b /* message key */
-#define KEY_MOVE 0x16c /* move key */
-#define KEY_NEXT 0x16d /* next object key */
-#define KEY_OPEN 0x16e /* open key */
-#define KEY_OPTIONS 0x16f /* options key */
-#define KEY_PREVIOUS 0x170 /* previous object key */
-#define KEY_REDO 0x171 /* redo key */
-#define KEY_REFERENCE 0x172 /* ref(erence) key */
-#define KEY_REFRESH 0x173 /* refresh key */
-#define KEY_REPLACE 0x174 /* replace key */
-#define KEY_RESTART 0x175 /* restart key */
-#define KEY_RESUME 0x176 /* resume key */
-#define KEY_SAVE 0x177 /* save key */
-#define KEY_SBEG 0x178 /* shifted beginning key */
-#define KEY_SCANCEL 0x179 /* shifted cancel key */
-#define KEY_SCOMMAND 0x17a /* shifted command key */
-#define KEY_SCOPY 0x17b /* shifted copy key */
-#define KEY_SCREATE 0x17c /* shifted create key */
-#define KEY_SDC 0x17d /* shifted delete char key */
-#define KEY_SDL 0x17e /* shifted delete line key */
-#define KEY_SELECT 0x17f /* select key */
-#define KEY_SEND 0x180 /* shifted end key */
-#define KEY_SEOL 0x181 /* shifted clear line key */
-#define KEY_SEXIT 0x182 /* shifted exit key */
-#define KEY_SFIND 0x183 /* shifted find key */
-#define KEY_SHOME 0x184 /* shifted home key */
-#define KEY_SIC 0x185 /* shifted input key */
-
-#define KEY_SLEFT 0x187 /* shifted left arrow key */
-#define KEY_SMESSAGE 0x188 /* shifted message key */
-#define KEY_SMOVE 0x189 /* shifted move key */
-#define KEY_SNEXT 0x18a /* shifted next key */
-#define KEY_SOPTIONS 0x18b /* shifted options key */
-#define KEY_SPREVIOUS 0x18c /* shifted prev key */
-#define KEY_SPRINT 0x18d /* shifted print key */
-#define KEY_SREDO 0x18e /* shifted redo key */
-#define KEY_SREPLACE 0x18f /* shifted replace key */
-#define KEY_SRIGHT 0x190 /* shifted right arrow */
-#define KEY_SRSUME 0x191 /* shifted resume key */
-#define KEY_SSAVE 0x192 /* shifted save key */
-#define KEY_SSUSPEND 0x193 /* shifted suspend key */
-#define KEY_SUNDO 0x194 /* shifted undo key */
-#define KEY_SUSPEND 0x195 /* suspend key */
-#define KEY_UNDO 0x196 /* undo key */
-
-/* PDCurses-specific key definitions -- PC only */
-
-#define ALT_0 0x197
-#define ALT_1 0x198
-#define ALT_2 0x199
-#define ALT_3 0x19a
-#define ALT_4 0x19b
-#define ALT_5 0x19c
-#define ALT_6 0x19d
-#define ALT_7 0x19e
-#define ALT_8 0x19f
-#define ALT_9 0x1a0
-#define ALT_A 0x1a1
-#define ALT_B 0x1a2
-#define ALT_C 0x1a3
-#define ALT_D 0x1a4
-#define ALT_E 0x1a5
-#define ALT_F 0x1a6
-#define ALT_G 0x1a7
-#define ALT_H 0x1a8
-#define ALT_I 0x1a9
-#define ALT_J 0x1aa
-#define ALT_K 0x1ab
-#define ALT_L 0x1ac
-#define ALT_M 0x1ad
-#define ALT_N 0x1ae
-#define ALT_O 0x1af
-#define ALT_P 0x1b0
-#define ALT_Q 0x1b1
-#define ALT_R 0x1b2
-#define ALT_S 0x1b3
-#define ALT_T 0x1b4
-#define ALT_U 0x1b5
-#define ALT_V 0x1b6
-#define ALT_W 0x1b7
-#define ALT_X 0x1b8
-#define ALT_Y 0x1b9
-#define ALT_Z 0x1ba
-
-#define CTL_LEFT 0x1bb /* Control-Left-Arrow */
-#define CTL_RIGHT 0x1bc
-#define CTL_PGUP 0x1bd
-#define CTL_PGDN 0x1be
-#define CTL_HOME 0x1bf
-#define CTL_END 0x1c0
-
-#define KEY_A1 0x1c1 /* upper left on Virtual keypad */
-#define KEY_A2 0x1c2 /* upper middle on Virt. keypad */
-#define KEY_A3 0x1c3 /* upper right on Vir. keypad */
-#define KEY_B1 0x1c4 /* middle left on Virt. keypad */
-#define KEY_B2 0x1c5 /* center on Virt. keypad */
-#define KEY_B3 0x1c6 /* middle right on Vir. keypad */
-#define KEY_C1 0x1c7 /* lower left on Virt. keypad */
-#define KEY_C2 0x1c8 /* lower middle on Virt. keypad */
-#define KEY_C3 0x1c9 /* lower right on Vir. keypad */
-
-#define PADSLASH 0x1ca /* slash on keypad */
-#define PADENTER 0x1cb /* enter on keypad */
-#define CTL_PADENTER 0x1cc /* ctl-enter on keypad */
-#define ALT_PADENTER 0x1cd /* alt-enter on keypad */
-#define PADSTOP 0x1ce /* stop on keypad */
-#define PADSTAR 0x1cf /* star on keypad */
-#define PADMINUS 0x1d0 /* minus on keypad */
-#define PADPLUS 0x1d1 /* plus on keypad */
-#define CTL_PADSTOP 0x1d2 /* ctl-stop on keypad */
-#define CTL_PADCENTER 0x1d3 /* ctl-enter on keypad */
-#define CTL_PADPLUS 0x1d4 /* ctl-plus on keypad */
-#define CTL_PADMINUS 0x1d5 /* ctl-minus on keypad */
-#define CTL_PADSLASH 0x1d6 /* ctl-slash on keypad */
-#define CTL_PADSTAR 0x1d7 /* ctl-star on keypad */
-#define ALT_PADPLUS 0x1d8 /* alt-plus on keypad */
-#define ALT_PADMINUS 0x1d9 /* alt-minus on keypad */
-#define ALT_PADSLASH 0x1da /* alt-slash on keypad */
-#define ALT_PADSTAR 0x1db /* alt-star on keypad */
-#define ALT_PADSTOP 0x1dc /* alt-stop on keypad */
-#define CTL_INS 0x1dd /* ctl-insert */
-#define ALT_DEL 0x1de /* alt-delete */
-#define ALT_INS 0x1df /* alt-insert */
-#define CTL_UP 0x1e0 /* ctl-up arrow */
-#define CTL_DOWN 0x1e1 /* ctl-down arrow */
-#define CTL_TAB 0x1e2 /* ctl-tab */
-#define ALT_TAB 0x1e3
-#define ALT_MINUS 0x1e4
-#define ALT_EQUAL 0x1e5
-#define ALT_HOME 0x1e6
-#define ALT_PGUP 0x1e7
-#define ALT_PGDN 0x1e8
-#define ALT_END 0x1e9
-#define ALT_UP 0x1ea /* alt-up arrow */
-#define ALT_DOWN 0x1eb /* alt-down arrow */
-#define ALT_RIGHT 0x1ec /* alt-right arrow */
-#define ALT_LEFT 0x1ed /* alt-left arrow */
-#define ALT_ENTER 0x1ee /* alt-enter */
-#define ALT_ESC 0x1ef /* alt-escape */
-#define ALT_BQUOTE 0x1f0 /* alt-back quote */
-#define ALT_LBRACKET 0x1f1 /* alt-left bracket */
-#define ALT_RBRACKET 0x1f2 /* alt-right bracket */
-#define ALT_SEMICOLON 0x1f3 /* alt-semi-colon */
-#define ALT_FQUOTE 0x1f4 /* alt-forward quote */
-#define ALT_COMMA 0x1f5 /* alt-comma */
-#define ALT_STOP 0x1f6 /* alt-stop */
-#define ALT_FSLASH 0x1f7 /* alt-forward slash */
-#define ALT_BKSP 0x1f8 /* alt-backspace */
-#define CTL_BKSP 0x1f9 /* ctl-backspace */
-#define PAD0 0x1fa /* keypad 0 */
-
-#define CTL_PAD0 0x1fb /* ctl-keypad 0 */
-#define CTL_PAD1 0x1fc
-#define CTL_PAD2 0x1fd
-#define CTL_PAD3 0x1fe
-#define CTL_PAD4 0x1ff
-#define CTL_PAD5 0x200
-#define CTL_PAD6 0x201
-#define CTL_PAD7 0x202
-#define CTL_PAD8 0x203
-#define CTL_PAD9 0x204
-
-#define ALT_PAD0 0x205 /* alt-keypad 0 */
-#define ALT_PAD1 0x206
-#define ALT_PAD2 0x207
-#define ALT_PAD3 0x208
-#define ALT_PAD4 0x209
-#define ALT_PAD5 0x20a
-#define ALT_PAD6 0x20b
-#define ALT_PAD7 0x20c
-#define ALT_PAD8 0x20d
-#define ALT_PAD9 0x20e
-
-#define CTL_DEL 0x20f /* clt-delete */
-#define ALT_BSLASH 0x210 /* alt-back slash */
-#define CTL_ENTER 0x211 /* ctl-enter */
-
-#define SHF_PADENTER 0x212 /* shift-enter on keypad */
-#define SHF_PADSLASH 0x213 /* shift-slash on keypad */
-#define SHF_PADSTAR 0x214 /* shift-star on keypad */
-#define SHF_PADPLUS 0x215 /* shift-plus on keypad */
-#define SHF_PADMINUS 0x216 /* shift-minus on keypad */
-#define SHF_UP 0x217 /* shift-up on keypad */
-#define SHF_DOWN 0x218 /* shift-down on keypad */
-#define SHF_IC 0x219 /* shift-insert on keypad */
-#define SHF_DC 0x21a /* shift-delete on keypad */
-
-#define KEY_MOUSE 0x21b /* "mouse" key */
-#define KEY_SHIFT_L 0x21c /* Left-shift */
-#define KEY_SHIFT_R 0x21d /* Right-shift */
-#define KEY_CONTROL_L 0x21e /* Left-control */
-#define KEY_CONTROL_R 0x21f /* Right-control */
-#define KEY_ALT_L 0x220 /* Left-alt */
-#define KEY_ALT_R 0x221 /* Right-alt */
-#define KEY_RESIZE 0x222 /* Window resize */
-#define KEY_SUP 0x223 /* Shifted up arrow */
-#define KEY_SDOWN 0x224 /* Shifted down arrow */
-
-#define KEY_MIN KEY_BREAK /* Minimum curses key value */
-#define KEY_MAX KEY_SDOWN /* Maximum curses key */
-
-#define KEY_F(n) (KEY_F0 + (n))
-
-/*----------------------------------------------------------------------
- *
- * PDCurses Function Declarations
- *
- */
-
-/* Standard */
-
-int addch(const chtype);
-int addchnstr(const chtype *, int);
-int addchstr(const chtype *);
-int addnstr(const char *, int);
-int addstr(const char *);
-int attroff(chtype);
-int attron(chtype);
-int attrset(chtype);
-int attr_get(attr_t *, short *, void *);
-int attr_off(attr_t, void *);
-int attr_on(attr_t, void *);
-int attr_set(attr_t, short, void *);
-int baudrate(void);
-int beep(void);
-int bkgd(chtype);
-void bkgdset(chtype);
-int border(chtype, chtype, chtype, chtype, chtype, chtype, chtype, chtype);
-int box(WINDOW *, chtype, chtype);
-bool can_change_color(void);
-int cbreak(void);
-int chgat(int, attr_t, short, const void *);
-int clearok(WINDOW *, bool);
-int clear(void);
-int clrtobot(void);
-int clrtoeol(void);
-int color_content(short, short *, short *, short *);
-int color_set(short, void *);
-int copywin(const WINDOW *, WINDOW *, int, int, int, int, int, int, int);
-int curs_set(int);
-int def_prog_mode(void);
-int def_shell_mode(void);
-int delay_output(int);
-int delch(void);
-int deleteln(void);
-void delscreen(SCREEN *);
-int delwin(WINDOW *);
-WINDOW *derwin(WINDOW *, int, int, int, int);
-int doupdate(void);
-WINDOW *dupwin(WINDOW *);
-int echochar(const chtype);
-int echo(void);
-int endwin(void);
-char erasechar(void);
-int erase(void);
-void filter(void);
-int flash(void);
-int flushinp(void);
-chtype getbkgd(WINDOW *);
-int getnstr(char *, int);
-int getstr(char *);
-WINDOW *getwin(FILE *);
-int halfdelay(int);
-bool has_colors(void);
-bool has_ic(void);
-bool has_il(void);
-int hline(chtype, int);
-void idcok(WINDOW *, bool);
-int idlok(WINDOW *, bool);
-void immedok(WINDOW *, bool);
-int inchnstr(chtype *, int);
-int inchstr(chtype *);
-chtype inch(void);
-int init_color(short, short, short, short);
-int init_pair(short, short, short);
-WINDOW *initscr(void);
-int innstr(char *, int);
-int insch(chtype);
-int insdelln(int);
-int insertln(void);
-int insnstr(const char *, int);
-int insstr(const char *);
-int instr(char *);
-int intrflush(WINDOW *, bool);
-bool isendwin(void);
-bool is_linetouched(WINDOW *, int);
-bool is_wintouched(WINDOW *);
-char *keyname(int);
-int keypad(WINDOW *, bool);
-char killchar(void);
-int leaveok(WINDOW *, bool);
-char *longname(void);
-int meta(WINDOW *, bool);
-int move(int, int);
-int mvaddch(int, int, const chtype);
-int mvaddchnstr(int, int, const chtype *, int);
-int mvaddchstr(int, int, const chtype *);
-int mvaddnstr(int, int, const char *, int);
-int mvaddstr(int, int, const char *);
-int mvchgat(int, int, int, attr_t, short, const void *);
-int mvcur(int, int, int, int);
-int mvdelch(int, int);
-int mvderwin(WINDOW *, int, int);
-int mvgetch(int, int);
-int mvgetnstr(int, int, char *, int);
-int mvgetstr(int, int, char *);
-int mvhline(int, int, chtype, int);
-chtype mvinch(int, int);
-int mvinchnstr(int, int, chtype *, int);
-int mvinchstr(int, int, chtype *);
-int mvinnstr(int, int, char *, int);
-int mvinsch(int, int, chtype);
-int mvinsnstr(int, int, const char *, int);
-int mvinsstr(int, int, const char *);
-int mvinstr(int, int, char *);
-int mvprintw(int, int, const char *, ...);
-int mvscanw(int, int, const char *, ...);
-int mvvline(int, int, chtype, int);
-int mvwaddchnstr(WINDOW *, int, int, const chtype *, int);
-int mvwaddchstr(WINDOW *, int, int, const chtype *);
-int mvwaddch(WINDOW *, int, int, const chtype);
-int mvwaddnstr(WINDOW *, int, int, const char *, int);
-int mvwaddstr(WINDOW *, int, int, const char *);
-int mvwchgat(WINDOW *, int, int, int, attr_t, short, const void *);
-int mvwdelch(WINDOW *, int, int);
-int mvwgetch(WINDOW *, int, int);
-int mvwgetnstr(WINDOW *, int, int, char *, int);
-int mvwgetstr(WINDOW *, int, int, char *);
-int mvwhline(WINDOW *, int, int, chtype, int);
-int mvwinchnstr(WINDOW *, int, int, chtype *, int);
-int mvwinchstr(WINDOW *, int, int, chtype *);
-chtype mvwinch(WINDOW *, int, int);
-int mvwinnstr(WINDOW *, int, int, char *, int);
-int mvwinsch(WINDOW *, int, int, chtype);
-int mvwinsnstr(WINDOW *, int, int, const char *, int);
-int mvwinsstr(WINDOW *, int, int, const char *);
-int mvwinstr(WINDOW *, int, int, char *);
-int mvwin(WINDOW *, int, int);
-int mvwprintw(WINDOW *, int, int, const char *, ...);
-int mvwscanw(WINDOW *, int, int, const char *, ...);
-int mvwvline(WINDOW *, int, int, chtype, int);
-int napms(int);
-WINDOW *newpad(int, int);
-SCREEN *newterm(const char *, FILE *, FILE *);
-WINDOW *newwin(int, int, int, int);
-int nl(void);
-int nocbreak(void);
-int nodelay(WINDOW *, bool);
-int noecho(void);
-int nonl(void);
-void noqiflush(void);
-int noraw(void);
-int notimeout(WINDOW *, bool);
-int overlay(const WINDOW *, WINDOW *);
-int overwrite(const WINDOW *, WINDOW *);
-int pair_content(short, short *, short *);
-int pechochar(WINDOW *, chtype);
-int pnoutrefresh(WINDOW *, int, int, int, int, int, int);
-int prefresh(WINDOW *, int, int, int, int, int, int);
-int printw(const char *, ...);
-int putwin(WINDOW *, FILE *);
-void qiflush(void);
-int raw(void);
-int redrawwin(WINDOW *);
-int refresh(void);
-int reset_prog_mode(void);
-int reset_shell_mode(void);
-int resetty(void);
-int ripoffline(int, int (*)(WINDOW *, int));
-int savetty(void);
-int scanw(const char *, ...);
-int scr_dump(const char *);
-int scr_init(const char *);
-int scr_restore(const char *);
-int scr_set(const char *);
-int scrl(int);
-int scroll(WINDOW *);
-int scrollok(WINDOW *, bool);
-SCREEN *set_term(SCREEN *);
-int setscrreg(int, int);
-int slk_attroff(const chtype);
-int slk_attr_off(const attr_t, void *);
-int slk_attron(const chtype);
-int slk_attr_on(const attr_t, void *);
-int slk_attrset(const chtype);
-int slk_attr_set(const attr_t, short, void *);
-int slk_clear(void);
-int slk_color(short);
-int slk_init(int);
-char *slk_label(int);
-int slk_noutrefresh(void);
-int slk_refresh(void);
-int slk_restore(void);
-int slk_set(int, const char *, int);
-int slk_touch(void);
-int standend(void);
-int standout(void);
-int start_color(void);
-WINDOW *subpad(WINDOW *, int, int, int, int);
-WINDOW *subwin(WINDOW *, int, int, int, int);
-int syncok(WINDOW *, bool);
-chtype termattrs(void);
-attr_t term_attrs(void);
-char *termname(void);
-void timeout(int);
-int touchline(WINDOW *, int, int);
-int touchwin(WINDOW *);
-int typeahead(int);
-int untouchwin(WINDOW *);
-void use_env(bool);
-int vidattr(chtype);
-int vid_attr(attr_t, short, void *);
-int vidputs(chtype, int (*)(int));
-int vid_puts(attr_t, short, void *, int (*)(int));
-int vline(chtype, int);
-int vw_printw(WINDOW *, const char *, va_list);
-int vwprintw(WINDOW *, const char *, va_list);
-int vw_scanw(WINDOW *, const char *, va_list);
-int vwscanw(WINDOW *, const char *, va_list);
-int waddchnstr(WINDOW *, const chtype *, int);
-int waddchstr(WINDOW *, const chtype *);
-int waddch(WINDOW *, const chtype);
-int waddnstr(WINDOW *, const char *, int);
-int waddstr(WINDOW *, const char *);
-int wattroff(WINDOW *, chtype);
-int wattron(WINDOW *, chtype);
-int wattrset(WINDOW *, chtype);
-int wattr_get(WINDOW *, attr_t *, short *, void *);
-int wattr_off(WINDOW *, attr_t, void *);
-int wattr_on(WINDOW *, attr_t, void *);
-int wattr_set(WINDOW *, attr_t, short, void *);
-void wbkgdset(WINDOW *, chtype);
-int wbkgd(WINDOW *, chtype);
-int wborder(WINDOW *, chtype, chtype, chtype, chtype,
- chtype, chtype, chtype, chtype);
-int wchgat(WINDOW *, int, attr_t, short, const void *);
-int wclear(WINDOW *);
-int wclrtobot(WINDOW *);
-int wclrtoeol(WINDOW *);
-int wcolor_set(WINDOW *, short, void *);
-void wcursyncup(WINDOW *);
-int wdelch(WINDOW *);
-int wdeleteln(WINDOW *);
-int wechochar(WINDOW *, const chtype);
-int werase(WINDOW *);
-int wgetch(WINDOW *);
-int wgetnstr(WINDOW *, char *, int);
-int wgetstr(WINDOW *, char *);
-int whline(WINDOW *, chtype, int);
-int winchnstr(WINDOW *, chtype *, int);
-int winchstr(WINDOW *, chtype *);
-chtype winch(WINDOW *);
-int winnstr(WINDOW *, char *, int);
-int winsch(WINDOW *, chtype);
-int winsdelln(WINDOW *, int);
-int winsertln(WINDOW *);
-int winsnstr(WINDOW *, const char *, int);
-int winsstr(WINDOW *, const char *);
-int winstr(WINDOW *, char *);
-int wmove(WINDOW *, int, int);
-int wnoutrefresh(WINDOW *);
-int wprintw(WINDOW *, const char *, ...);
-int wredrawln(WINDOW *, int, int);
-int wrefresh(WINDOW *);
-int wscanw(WINDOW *, const char *, ...);
-int wscrl(WINDOW *, int);
-int wsetscrreg(WINDOW *, int, int);
-int wstandend(WINDOW *);
-int wstandout(WINDOW *);
-void wsyncdown(WINDOW *);
-void wsyncup(WINDOW *);
-void wtimeout(WINDOW *, int);
-int wtouchln(WINDOW *, int, int, int);
-int wvline(WINDOW *, chtype, int);
-
-/* Wide-character functions */
-
-#ifdef PDC_WIDE
-int addnwstr(const wchar_t *, int);
-int addwstr(const wchar_t *);
-int add_wch(const cchar_t *);
-int add_wchnstr(const cchar_t *, int);
-int add_wchstr(const cchar_t *);
-int border_set(const cchar_t *, const cchar_t *, const cchar_t *,
- const cchar_t *, const cchar_t *, const cchar_t *,
- const cchar_t *, const cchar_t *);
-int box_set(WINDOW *, const cchar_t *, const cchar_t *);
-int echo_wchar(const cchar_t *);
-int erasewchar(wchar_t *);
-int getbkgrnd(cchar_t *);
-int getcchar(const cchar_t *, wchar_t *, attr_t *, short *, void *);
-int getn_wstr(wint_t *, int);
-int get_wch(wint_t *);
-int get_wstr(wint_t *);
-int hline_set(const cchar_t *, int);
-int innwstr(wchar_t *, int);
-int ins_nwstr(const wchar_t *, int);
-int ins_wch(const cchar_t *);
-int ins_wstr(const wchar_t *);
-int inwstr(wchar_t *);
-int in_wch(cchar_t *);
-int in_wchnstr(cchar_t *, int);
-int in_wchstr(cchar_t *);
-char *key_name(wchar_t);
-int killwchar(wchar_t *);
-int mvaddnwstr(int, int, const wchar_t *, int);
-int mvaddwstr(int, int, const wchar_t *);
-int mvadd_wch(int, int, const cchar_t *);
-int mvadd_wchnstr(int, int, const cchar_t *, int);
-int mvadd_wchstr(int, int, const cchar_t *);
-int mvgetn_wstr(int, int, wint_t *, int);
-int mvget_wch(int, int, wint_t *);
-int mvget_wstr(int, int, wint_t *);
-int mvhline_set(int, int, const cchar_t *, int);
-int mvinnwstr(int, int, wchar_t *, int);
-int mvins_nwstr(int, int, const wchar_t *, int);
-int mvins_wch(int, int, const cchar_t *);
-int mvins_wstr(int, int, const wchar_t *);
-int mvinwstr(int, int, wchar_t *);
-int mvin_wch(int, int, cchar_t *);
-int mvin_wchnstr(int, int, cchar_t *, int);
-int mvin_wchstr(int, int, cchar_t *);
-int mvvline_set(int, int, const cchar_t *, int);
-int mvwaddnwstr(WINDOW *, int, int, const wchar_t *, int);
-int mvwaddwstr(WINDOW *, int, int, const wchar_t *);
-int mvwadd_wch(WINDOW *, int, int, const cchar_t *);
-int mvwadd_wchnstr(WINDOW *, int, int, const cchar_t *, int);
-int mvwadd_wchstr(WINDOW *, int, int, const cchar_t *);
-int mvwgetn_wstr(WINDOW *, int, int, wint_t *, int);
-int mvwget_wch(WINDOW *, int, int, wint_t *);
-int mvwget_wstr(WINDOW *, int, int, wint_t *);
-int mvwhline_set(WINDOW *, int, int, const cchar_t *, int);
-int mvwinnwstr(WINDOW *, int, int, wchar_t *, int);
-int mvwins_nwstr(WINDOW *, int, int, const wchar_t *, int);
-int mvwins_wch(WINDOW *, int, int, const cchar_t *);
-int mvwins_wstr(WINDOW *, int, int, const wchar_t *);
-int mvwin_wch(WINDOW *, int, int, cchar_t *);
-int mvwin_wchnstr(WINDOW *, int, int, cchar_t *, int);
-int mvwin_wchstr(WINDOW *, int, int, cchar_t *);
-int mvwinwstr(WINDOW *, int, int, wchar_t *);
-int mvwvline_set(WINDOW *, int, int, const cchar_t *, int);
-int pecho_wchar(WINDOW *, const cchar_t*);
-int setcchar(cchar_t*, const wchar_t*, const attr_t, short, const void*);
-int slk_wset(int, const wchar_t *, int);
-int unget_wch(const wchar_t);
-int vline_set(const cchar_t *, int);
-int waddnwstr(WINDOW *, const wchar_t *, int);
-int waddwstr(WINDOW *, const wchar_t *);
-int wadd_wch(WINDOW *, const cchar_t *);
-int wadd_wchnstr(WINDOW *, const cchar_t *, int);
-int wadd_wchstr(WINDOW *, const cchar_t *);
-int wbkgrnd(WINDOW *, const cchar_t *);
-void wbkgrndset(WINDOW *, const cchar_t *);
-int wborder_set(WINDOW *, const cchar_t *, const cchar_t *,
- const cchar_t *, const cchar_t *, const cchar_t *,
- const cchar_t *, const cchar_t *, const cchar_t *);
-int wecho_wchar(WINDOW *, const cchar_t *);
-int wgetbkgrnd(WINDOW *, cchar_t *);
-int wgetn_wstr(WINDOW *, wint_t *, int);
-int wget_wch(WINDOW *, wint_t *);
-int wget_wstr(WINDOW *, wint_t *);
-int whline_set(WINDOW *, const cchar_t *, int);
-int winnwstr(WINDOW *, wchar_t *, int);
-int wins_nwstr(WINDOW *, const wchar_t *, int);
-int wins_wch(WINDOW *, const cchar_t *);
-int wins_wstr(WINDOW *, const wchar_t *);
-int winwstr(WINDOW *, wchar_t *);
-int win_wch(WINDOW *, cchar_t *);
-int win_wchnstr(WINDOW *, cchar_t *, int);
-int win_wchstr(WINDOW *, cchar_t *);
-wchar_t *wunctrl(cchar_t *);
-int wvline_set(WINDOW *, const cchar_t *, int);
-#endif
-
-/* Quasi-standard */
-
-chtype getattrs(WINDOW *);
-int getbegx(WINDOW *);
-int getbegy(WINDOW *);
-int getmaxx(WINDOW *);
-int getmaxy(WINDOW *);
-int getparx(WINDOW *);
-int getpary(WINDOW *);
-int getcurx(WINDOW *);
-int getcury(WINDOW *);
-void traceoff(void);
-void traceon(void);
-char *unctrl(chtype);
-
-int crmode(void);
-int nocrmode(void);
-int draino(int);
-int resetterm(void);
-int fixterm(void);
-int saveterm(void);
-int setsyx(int, int);
-
-int mouse_set(unsigned long);
-int mouse_on(unsigned long);
-int mouse_off(unsigned long);
-int request_mouse_pos(void);
-int map_button(unsigned long);
-void wmouse_position(WINDOW *, int *, int *);
-unsigned long getmouse(void);
-unsigned long getbmap(void);
-
-/* ncurses */
-
-int assume_default_colors(int, int);
-const char *curses_version(void);
-bool has_key(int);
-int use_default_colors(void);
-int wresize(WINDOW *, int, int);
-
-int mouseinterval(int);
-mmask_t mousemask(mmask_t, mmask_t *);
-bool mouse_trafo(int *, int *, bool);
-int nc_getmouse(MEVENT *);
-int ungetmouse(MEVENT *);
-bool wenclose(const WINDOW *, int, int);
-bool wmouse_trafo(const WINDOW *, int *, int *, bool);
-
-/* PDCurses */
-
-int addrawch(chtype);
-int insrawch(chtype);
-bool is_termresized(void);
-int mvaddrawch(int, int, chtype);
-int mvdeleteln(int, int);
-int mvinsertln(int, int);
-int mvinsrawch(int, int, chtype);
-int mvwaddrawch(WINDOW *, int, int, chtype);
-int mvwdeleteln(WINDOW *, int, int);
-int mvwinsertln(WINDOW *, int, int);
-int mvwinsrawch(WINDOW *, int, int, chtype);
-int raw_output(bool);
-int resize_term(int, int);
-WINDOW *resize_window(WINDOW *, int, int);
-int waddrawch(WINDOW *, chtype);
-int winsrawch(WINDOW *, chtype);
-char wordchar(void);
-
-#ifdef PDC_WIDE
-wchar_t *slk_wlabel(int);
-#endif
-
-void PDC_debug(const char *, ...);
-int PDC_ungetch(int);
-int PDC_set_blink(bool);
-int PDC_set_line_color(short);
-void PDC_set_title(const char *);
-
-int PDC_clearclipboard(void);
-int PDC_freeclipboard(char *);
-int PDC_getclipboard(char **, long *);
-int PDC_setclipboard(const char *, long);
-
-unsigned long PDC_get_input_fd(void);
-unsigned long PDC_get_key_modifiers(void);
-int PDC_return_key_modifiers(bool);
-int PDC_save_key_modifiers(bool);
-
-#ifdef XCURSES
-WINDOW *Xinitscr(int, char **);
-void XCursesExit(void);
-int sb_init(void);
-int sb_set_horz(int, int, int);
-int sb_set_vert(int, int, int);
-int sb_get_horz(int *, int *, int *);
-int sb_get_vert(int *, int *, int *);
-int sb_refresh(void);
-#endif
-
-/*** Functions defined as macros ***/
-
-/* getch() and ungetch() conflict with some DOS libraries */
-
-#define getch() wgetch(stdscr)
-#define ungetch(ch) PDC_ungetch(ch)
-
-#define COLOR_PAIR(n) (((chtype)(n) << PDC_COLOR_SHIFT) & A_COLOR)
-#define PAIR_NUMBER(n) (((n) & A_COLOR) >> PDC_COLOR_SHIFT)
-
-/* These will _only_ work as macros */
-
-#define getbegyx(w, y, x) (y = getbegy(w), x = getbegx(w))
-#define getmaxyx(w, y, x) (y = getmaxy(w), x = getmaxx(w))
-#define getparyx(w, y, x) (y = getpary(w), x = getparx(w))
-#define getyx(w, y, x) (y = getcury(w), x = getcurx(w))
-
-#define getsyx(y, x) { if (curscr->_leaveit) (y)=(x)=-1; \
- else getyx(curscr,(y),(x)); }
-
-#ifdef NCURSES_MOUSE_VERSION
-# define getmouse(x) nc_getmouse(x)
-#endif
-
-/* return codes from PDC_getclipboard() and PDC_setclipboard() calls */
-
-#define PDC_CLIP_SUCCESS 0
-#define PDC_CLIP_ACCESS_ERROR 1
-#define PDC_CLIP_EMPTY 2
-#define PDC_CLIP_MEMORY_ERROR 3
-
-/* PDCurses key modifier masks */
-
-#define PDC_KEY_MODIFIER_SHIFT 1
-#define PDC_KEY_MODIFIER_CONTROL 2
-#define PDC_KEY_MODIFIER_ALT 4
-#define PDC_KEY_MODIFIER_NUMLOCK 8
-
-#if defined(__cplusplus) || defined(__cplusplus__) || defined(__CPLUSPLUS)
-# undef bool
-}
-#endif
-
-#endif /* __PDCURSES__ */
diff --git a/sam/win32/zconf.h b/sam/win32/zconf.h
deleted file mode 100644
index 03a9431..0000000
--- a/sam/win32/zconf.h
+++ /dev/null
@@ -1,332 +0,0 @@
-/* zconf.h -- configuration of the zlib compression library
- * Copyright (C) 1995-2005 Jean-loup Gailly.
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-/* @(#) $Id$ */
-
-#ifndef ZCONF_H
-#define ZCONF_H
-
-/*
- * If you *really* need a unique prefix for all types and library functions,
- * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it.
- */
-#ifdef Z_PREFIX
-# define deflateInit_ z_deflateInit_
-# define deflate z_deflate
-# define deflateEnd z_deflateEnd
-# define inflateInit_ z_inflateInit_
-# define inflate z_inflate
-# define inflateEnd z_inflateEnd
-# define deflateInit2_ z_deflateInit2_
-# define deflateSetDictionary z_deflateSetDictionary
-# define deflateCopy z_deflateCopy
-# define deflateReset z_deflateReset
-# define deflateParams z_deflateParams
-# define deflateBound z_deflateBound
-# define deflatePrime z_deflatePrime
-# define inflateInit2_ z_inflateInit2_
-# define inflateSetDictionary z_inflateSetDictionary
-# define inflateSync z_inflateSync
-# define inflateSyncPoint z_inflateSyncPoint
-# define inflateCopy z_inflateCopy
-# define inflateReset z_inflateReset
-# define inflateBack z_inflateBack
-# define inflateBackEnd z_inflateBackEnd
-# define compress z_compress
-# define compress2 z_compress2
-# define compressBound z_compressBound
-# define uncompress z_uncompress
-# define adler32 z_adler32
-# define crc32 z_crc32
-# define get_crc_table z_get_crc_table
-# define zError z_zError
-
-# define alloc_func z_alloc_func
-# define free_func z_free_func
-# define in_func z_in_func
-# define out_func z_out_func
-# define Byte z_Byte
-# define uInt z_uInt
-# define uLong z_uLong
-# define Bytef z_Bytef
-# define charf z_charf
-# define intf z_intf
-# define uIntf z_uIntf
-# define uLongf z_uLongf
-# define voidpf z_voidpf
-# define voidp z_voidp
-#endif
-
-#if defined(__MSDOS__) && !defined(MSDOS)
-# define MSDOS
-#endif
-#if (defined(OS_2) || defined(__OS2__)) && !defined(OS2)
-# define OS2
-#endif
-#if defined(_WINDOWS) && !defined(WINDOWS)
-# define WINDOWS
-#endif
-#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__)
-# ifndef WIN32
-# define WIN32
-# endif
-#endif
-#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32)
-# if !defined(__GNUC__) && !defined(__FLAT__) && !defined(__386__)
-# ifndef SYS16BIT
-# define SYS16BIT
-# endif
-# endif
-#endif
-
-/*
- * Compile with -DMAXSEG_64K if the alloc function cannot allocate more
- * than 64k bytes at a time (needed on systems with 16-bit int).
- */
-#ifdef SYS16BIT
-# define MAXSEG_64K
-#endif
-#ifdef MSDOS
-# define UNALIGNED_OK
-#endif
-
-#ifdef __STDC_VERSION__
-# ifndef STDC
-# define STDC
-# endif
-# if __STDC_VERSION__ >= 199901L
-# ifndef STDC99
-# define STDC99
-# endif
-# endif
-#endif
-#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus))
-# define STDC
-#endif
-#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__))
-# define STDC
-#endif
-#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32))
-# define STDC
-#endif
-#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__))
-# define STDC
-#endif
-
-#if defined(__OS400__) && !defined(STDC) /* iSeries (formerly AS/400). */
-# define STDC
-#endif
-
-#ifndef STDC
-# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */
-# define const /* note: need a more gentle solution here */
-# endif
-#endif
-
-/* Some Mac compilers merge all .h files incorrectly: */
-#if defined(__MWERKS__)||defined(applec)||defined(THINK_C)||defined(__SC__)
-# define NO_DUMMY_DECL
-#endif
-
-/* Maximum value for memLevel in deflateInit2 */
-#ifndef MAX_MEM_LEVEL
-# ifdef MAXSEG_64K
-# define MAX_MEM_LEVEL 8
-# else
-# define MAX_MEM_LEVEL 9
-# endif
-#endif
-
-/* Maximum value for windowBits in deflateInit2 and inflateInit2.
- * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files
- * created by gzip. (Files created by minigzip can still be extracted by
- * gzip.)
- */
-#ifndef MAX_WBITS
-# define MAX_WBITS 15 /* 32K LZ77 window */
-#endif
-
-/* The memory requirements for deflate are (in bytes):
- (1 << (windowBits+2)) + (1 << (memLevel+9))
- that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values)
- plus a few kilobytes for small objects. For example, if you want to reduce
- the default memory requirements from 256K to 128K, compile with
- make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7"
- Of course this will generally degrade compression (there's no free lunch).
-
- The memory requirements for inflate are (in bytes) 1 << windowBits
- that is, 32K for windowBits=15 (default value) plus a few kilobytes
- for small objects.
-*/
-
- /* Type declarations */
-
-#ifndef OF /* function prototypes */
-# ifdef STDC
-# define OF(args) args
-# else
-# define OF(args) ()
-# endif
-#endif
-
-/* The following definitions for FAR are needed only for MSDOS mixed
- * model programming (small or medium model with some far allocations).
- * This was tested only with MSC; for other MSDOS compilers you may have
- * to define NO_MEMCPY in zutil.h. If you don't need the mixed model,
- * just define FAR to be empty.
- */
-#ifdef SYS16BIT
-# if defined(M_I86SM) || defined(M_I86MM)
- /* MSC small or medium model */
-# define SMALL_MEDIUM
-# ifdef _MSC_VER
-# define FAR _far
-# else
-# define FAR far
-# endif
-# endif
-# if (defined(__SMALL__) || defined(__MEDIUM__))
- /* Turbo C small or medium model */
-# define SMALL_MEDIUM
-# ifdef __BORLANDC__
-# define FAR _far
-# else
-# define FAR far
-# endif
-# endif
-#endif
-
-#if defined(WINDOWS) || defined(WIN32)
- /* If building or using zlib as a DLL, define ZLIB_DLL.
- * This is not mandatory, but it offers a little performance increase.
- */
-# ifdef ZLIB_DLL
-# if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500))
-# ifdef ZLIB_INTERNAL
-# define ZEXTERN extern __declspec(dllexport)
-# else
-# define ZEXTERN extern __declspec(dllimport)
-# endif
-# endif
-# endif /* ZLIB_DLL */
- /* If building or using zlib with the WINAPI/WINAPIV calling convention,
- * define ZLIB_WINAPI.
- * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI.
- */
-# ifdef ZLIB_WINAPI
-# ifdef FAR
-# undef FAR
-# endif
-# include <windows.h>
- /* No need for _export, use ZLIB.DEF instead. */
- /* For complete Windows compatibility, use WINAPI, not __stdcall. */
-# define ZEXPORT WINAPI
-# ifdef WIN32
-# define ZEXPORTVA WINAPIV
-# else
-# define ZEXPORTVA FAR CDECL
-# endif
-# endif
-#endif
-
-#if defined (__BEOS__)
-# ifdef ZLIB_DLL
-# ifdef ZLIB_INTERNAL
-# define ZEXPORT __declspec(dllexport)
-# define ZEXPORTVA __declspec(dllexport)
-# else
-# define ZEXPORT __declspec(dllimport)
-# define ZEXPORTVA __declspec(dllimport)
-# endif
-# endif
-#endif
-
-#ifndef ZEXTERN
-# define ZEXTERN extern
-#endif
-#ifndef ZEXPORT
-# define ZEXPORT
-#endif
-#ifndef ZEXPORTVA
-# define ZEXPORTVA
-#endif
-
-#ifndef FAR
-# define FAR
-#endif
-
-#if !defined(__MACTYPES__)
-typedef unsigned char Byte; /* 8 bits */
-#endif
-typedef unsigned int uInt; /* 16 bits or more */
-typedef unsigned long uLong; /* 32 bits or more */
-
-#ifdef SMALL_MEDIUM
- /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */
-# define Bytef Byte FAR
-#else
- typedef Byte FAR Bytef;
-#endif
-typedef char FAR charf;
-typedef int FAR intf;
-typedef uInt FAR uIntf;
-typedef uLong FAR uLongf;
-
-#ifdef STDC
- typedef void const *voidpc;
- typedef void FAR *voidpf;
- typedef void *voidp;
-#else
- typedef Byte const *voidpc;
- typedef Byte FAR *voidpf;
- typedef Byte *voidp;
-#endif
-
-#if 0 /* HAVE_UNISTD_H -- this line is updated by ./configure */
-# include <sys/types.h> /* for off_t */
-# include <unistd.h> /* for SEEK_* and off_t */
-# ifdef VMS
-# include <unixio.h> /* for off_t */
-# endif
-# define z_off_t off_t
-#endif
-#ifndef SEEK_SET
-# define SEEK_SET 0 /* Seek from beginning of file. */
-# define SEEK_CUR 1 /* Seek from current position. */
-# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */
-#endif
-#ifndef z_off_t
-# define z_off_t long
-#endif
-
-#if defined(__OS400__)
-# define NO_vsnprintf
-#endif
-
-#if defined(__MVS__)
-# define NO_vsnprintf
-# ifdef FAR
-# undef FAR
-# endif
-#endif
-
-/* MVS linker does not support external names larger than 8 bytes */
-#if defined(__MVS__)
-# pragma map(deflateInit_,"DEIN")
-# pragma map(deflateInit2_,"DEIN2")
-# pragma map(deflateEnd,"DEEND")
-# pragma map(deflateBound,"DEBND")
-# pragma map(inflateInit_,"ININ")
-# pragma map(inflateInit2_,"ININ2")
-# pragma map(inflateEnd,"INEND")
-# pragma map(inflateSync,"INSY")
-# pragma map(inflateSetDictionary,"INSEDI")
-# pragma map(compressBound,"CMBND")
-# pragma map(inflate_table,"INTABL")
-# pragma map(inflate_fast,"INFA")
-# pragma map(inflate_copyright,"INCOPY")
-#endif
-
-#endif /* ZCONF_H */
diff --git a/sam/win32/zlib.h b/sam/win32/zlib.h
deleted file mode 100644
index 0228179..0000000
--- a/sam/win32/zlib.h
+++ /dev/null
@@ -1,1357 +0,0 @@
-/* zlib.h -- interface of the 'zlib' general purpose compression library
- version 1.2.3, July 18th, 2005
-
- Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler
-
- This software is provided 'as-is', without any express or implied
- warranty. In no event will the authors be held liable for any damages
- arising from the use of this software.
-
- Permission is granted to anyone to use this software for any purpose,
- including commercial applications, and to alter it and redistribute it
- freely, subject to the following restrictions:
-
- 1. The origin of this software must not be misrepresented; you must not
- claim that you wrote the original software. If you use this software
- in a product, an acknowledgment in the product documentation would be
- appreciated but is not required.
- 2. Altered source versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
- 3. This notice may not be removed or altered from any source distribution.
-
- Jean-loup Gailly Mark Adler
- ***@gzip.org ***@alumni.caltech.edu
-
-
- The data format used by the zlib library is described by RFCs (Request for
- Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt
- (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format).
-*/
-
-#ifndef ZLIB_H
-#define ZLIB_H
-
-#include "zconf.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ZLIB_VERSION "1.2.3"
-#define ZLIB_VERNUM 0x1230
-
-/*
- The 'zlib' compression library provides in-memory compression and
- decompression functions, including integrity checks of the uncompressed
- data. This version of the library supports only one compression method
- (deflation) but other algorithms will be added later and will have the same
- stream interface.
-
- Compression can be done in a single step if the buffers are large
- enough (for example if an input file is mmap'ed), or can be done by
- repeated calls of the compression function. In the latter case, the
- application must provide more input and/or consume the output
- (providing more output space) before each call.
-
- The compressed data format used by default by the in-memory functions is
- the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped
- around a deflate stream, which is itself documented in RFC 1951.
-
- The library also supports reading and writing files in gzip (.gz) format
- with an interface similar to that of stdio using the functions that start
- with "gz". The gzip format is different from the zlib format. gzip is a
- gzip wrapper, documented in RFC 1952, wrapped around a deflate stream.
-
- This library can optionally read and write gzip streams in memory as well.
-
- The zlib format was designed to be compact and fast for use in memory
- and on communications channels. The gzip format was designed for single-
- file compression on file systems, has a larger header than zlib to maintain
- directory information, and uses a different, slower check method than zlib.
-
- The library does not install any signal handler. The decoder checks
- the consistency of the compressed data, so the library should never
- crash even in case of corrupted input.
-*/
-
-typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size));
-typedef void (*free_func) OF((voidpf opaque, voidpf address));
-
-struct internal_state;
-
-typedef struct z_stream_s {
- Bytef *next_in; /* next input byte */
- uInt avail_in; /* number of bytes available at next_in */
- uLong total_in; /* total nb of input bytes read so far */
-
- Bytef *next_out; /* next output byte should be put there */
- uInt avail_out; /* remaining free space at next_out */
- uLong total_out; /* total nb of bytes output so far */
-
- char *msg; /* last error message, NULL if no error */
- struct internal_state FAR *state; /* not visible by applications */
-
- alloc_func zalloc; /* used to allocate the internal state */
- free_func zfree; /* used to free the internal state */
- voidpf opaque; /* private data object passed to zalloc and zfree */
-
- int data_type; /* best guess about the data type: binary or text */
- uLong adler; /* adler32 value of the uncompressed data */
- uLong reserved; /* reserved for future use */
-} z_stream;
-
-typedef z_stream FAR *z_streamp;
-
-/*
- gzip header information passed to and from zlib routines. See RFC 1952
- for more details on the meanings of these fields.
-*/
-typedef struct gz_header_s {
- int text; /* true if compressed data believed to be text */
- uLong time; /* modification time */
- int xflags; /* extra flags (not used when writing a gzip file) */
- int os; /* operating system */
- Bytef *extra; /* pointer to extra field or Z_NULL if none */
- uInt extra_len; /* extra field length (valid if extra != Z_NULL) */
- uInt extra_max; /* space at extra (only when reading header) */
- Bytef *name; /* pointer to zero-terminated file name or Z_NULL */
- uInt name_max; /* space at name (only when reading header) */
- Bytef *comment; /* pointer to zero-terminated comment or Z_NULL */
- uInt comm_max; /* space at comment (only when reading header) */
- int hcrc; /* true if there was or will be a header crc */
- int done; /* true when done reading gzip header (not used
- when writing a gzip file) */
-} gz_header;
-
-typedef gz_header FAR *gz_headerp;
-
-/*
- The application must update next_in and avail_in when avail_in has
- dropped to zero. It must update next_out and avail_out when avail_out
- has dropped to zero. The application must initialize zalloc, zfree and
- opaque before calling the init function. All other fields are set by the
- compression library and must not be updated by the application.
-
- The opaque value provided by the application will be passed as the first
- parameter for calls of zalloc and zfree. This can be useful for custom
- memory management. The compression library attaches no meaning to the
- opaque value.
-
- zalloc must return Z_NULL if there is not enough memory for the object.
- If zlib is used in a multi-threaded application, zalloc and zfree must be
- thread safe.
-
- On 16-bit systems, the functions zalloc and zfree must be able to allocate
- exactly 65536 bytes, but will not be required to allocate more than this
- if the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS,
- pointers returned by zalloc for objects of exactly 65536 bytes *must*
- have their offset normalized to zero. The default allocation function
- provided by this library ensures this (see zutil.c). To reduce memory
- requirements and avoid any allocation of 64K objects, at the expense of
- compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h).
-
- The fields total_in and total_out can be used for statistics or
- progress reports. After compression, total_in holds the total size of
- the uncompressed data and may be saved for use in the decompressor
- (particularly if the decompressor wants to decompress everything in
- a single step).
-*/
-
- /* constants */
-
-#define Z_NO_FLUSH 0
-#define Z_PARTIAL_FLUSH 1 /* will be removed, use Z_SYNC_FLUSH instead */
-#define Z_SYNC_FLUSH 2
-#define Z_FULL_FLUSH 3
-#define Z_FINISH 4
-#define Z_BLOCK 5
-/* Allowed flush values; see deflate() and inflate() below for details */
-
-#define Z_OK 0
-#define Z_STREAM_END 1
-#define Z_NEED_DICT 2
-#define Z_ERRNO (-1)
-#define Z_STREAM_ERROR (-2)
-#define Z_DATA_ERROR (-3)
-#define Z_MEM_ERROR (-4)
-#define Z_BUF_ERROR (-5)
-#define Z_VERSION_ERROR (-6)
-/* Return codes for the compression/decompression functions. Negative
- * values are errors, positive values are used for special but normal events.
- */
-
-#define Z_NO_COMPRESSION 0
-#define Z_BEST_SPEED 1
-#define Z_BEST_COMPRESSION 9
-#define Z_DEFAULT_COMPRESSION (-1)
-/* compression levels */
-
-#define Z_FILTERED 1
-#define Z_HUFFMAN_ONLY 2
-#define Z_RLE 3
-#define Z_FIXED 4
-#define Z_DEFAULT_STRATEGY 0
-/* compression strategy; see deflateInit2() below for details */
-
-#define Z_BINARY 0
-#define Z_TEXT 1
-#define Z_ASCII Z_TEXT /* for compatibility with 1.2.2 and earlier */
-#define Z_UNKNOWN 2
-/* Possible values of the data_type field (though see inflate()) */
-
-#define Z_DEFLATED 8
-/* The deflate compression method (the only one supported in this version) */
-
-#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */
-
-#define zlib_version zlibVersion()
-/* for compatibility with versions < 1.0.2 */
-
- /* basic functions */
-
-ZEXTERN const char * ZEXPORT zlibVersion OF((void));
-/* The application can compare zlibVersion and ZLIB_VERSION for consistency.
- If the first character differs, the library code actually used is
- not compatible with the zlib.h header file used by the application.
- This check is automatically made by deflateInit and inflateInit.
- */
-
-/*
-ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level));
-
- Initializes the internal stream state for compression. The fields
- zalloc, zfree and opaque must be initialized before by the caller.
- If zalloc and zfree are set to Z_NULL, deflateInit updates them to
- use default allocation functions.
-
- The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9:
- 1 gives best speed, 9 gives best compression, 0 gives no compression at
- all (the input data is simply copied a block at a time).
- Z_DEFAULT_COMPRESSION requests a default compromise between speed and
- compression (currently equivalent to level 6).
-
- deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not
- enough memory, Z_STREAM_ERROR if level is not a valid compression level,
- Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible
- with the version assumed by the caller (ZLIB_VERSION).
- msg is set to null if there is no error message. deflateInit does not
- perform any compression: this will be done by deflate().
-*/
-
-
-ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush));
-/*
- deflate compresses as much data as possible, and stops when the input
- buffer becomes empty or the output buffer becomes full. It may introduce some
- output latency (reading input without producing any output) except when
- forced to flush.
-
- The detailed semantics are as follows. deflate performs one or both of the
- following actions:
-
- - Compress more input starting at next_in and update next_in and avail_in
- accordingly. If not all input can be processed (because there is not
- enough room in the output buffer), next_in and avail_in are updated and
- processing will resume at this point for the next call of deflate().
-
- - Provide more output starting at next_out and update next_out and avail_out
- accordingly. This action is forced if the parameter flush is non zero.
- Forcing flush frequently degrades the compression ratio, so this parameter
- should be set only when necessary (in interactive applications).
- Some output may be provided even if flush is not set.
-
- Before the call of deflate(), the application should ensure that at least
- one of the actions is possible, by providing more input and/or consuming
- more output, and updating avail_in or avail_out accordingly; avail_out
- should never be zero before the call. The application can consume the
- compressed output when it wants, for example when the output buffer is full
- (avail_out == 0), or after each call of deflate(). If deflate returns Z_OK
- and with zero avail_out, it must be called again after making room in the
- output buffer because there might be more output pending.
-
- Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to
- decide how much data to accumulate before producing output, in order to
- maximize compression.
-
- If the parameter flush is set to Z_SYNC_FLUSH, all pending output is
- flushed to the output buffer and the output is aligned on a byte boundary, so
- that the decompressor can get all input data available so far. (In particular
- avail_in is zero after the call if enough output space has been provided
- before the call.) Flushing may degrade compression for some compression
- algorithms and so it should be used only when necessary.
-
- If flush is set to Z_FULL_FLUSH, all output is flushed as with
- Z_SYNC_FLUSH, and the compression state is reset so that decompression can
- restart from this point if previous compressed data has been damaged or if
- random access is desired. Using Z_FULL_FLUSH too often can seriously degrade
- compression.
-
- If deflate returns with avail_out == 0, this function must be called again
- with the same value of the flush parameter and more output space (updated
- avail_out), until the flush is complete (deflate returns with non-zero
- avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that
- avail_out is greater than six to avoid repeated flush markers due to
- avail_out == 0 on return.
-
- If the parameter flush is set to Z_FINISH, pending input is processed,
- pending output is flushed and deflate returns with Z_STREAM_END if there
- was enough output space; if deflate returns with Z_OK, this function must be
- called again with Z_FINISH and more output space (updated avail_out) but no
- more input data, until it returns with Z_STREAM_END or an error. After
- deflate has returned Z_STREAM_END, the only possible operations on the
- stream are deflateReset or deflateEnd.
-
- Z_FINISH can be used immediately after deflateInit if all the compression
- is to be done in a single step. In this case, avail_out must be at least
- the value returned by deflateBound (see below). If deflate does not return
- Z_STREAM_END, then it must be called again as described above.
-
- deflate() sets strm->adler to the adler32 checksum of all input read
- so far (that is, total_in bytes).
-
- deflate() may update strm->data_type if it can make a good guess about
- the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered
- binary. This field is only for information purposes and does not affect
- the compression algorithm in any manner.
-
- deflate() returns Z_OK if some progress has been made (more input
- processed or more output produced), Z_STREAM_END if all input has been
- consumed and all output has been produced (only when flush is set to
- Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example
- if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible
- (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not
- fatal, and deflate() can be called again with more input and more output
- space to continue compressing.
-*/
-
-
-ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm));
-/*
- All dynamically allocated data structures for this stream are freed.
- This function discards any unprocessed input and does not flush any
- pending output.
-
- deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the
- stream state was inconsistent, Z_DATA_ERROR if the stream was freed
- prematurely (some input or output was discarded). In the error case,
- msg may be set but then points to a static string (which must not be
- deallocated).
-*/
-
-
-/*
-ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm));
-
- Initializes the internal stream state for decompression. The fields
- next_in, avail_in, zalloc, zfree and opaque must be initialized before by
- the caller. If next_in is not Z_NULL and avail_in is large enough (the exact
- value depends on the compression method), inflateInit determines the
- compression method from the zlib header and allocates all data structures
- accordingly; otherwise the allocation will be deferred to the first call of
- inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to
- use default allocation functions.
-
- inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
- memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
- version assumed by the caller. msg is set to null if there is no error
- message. inflateInit does not perform any decompression apart from reading
- the zlib header if present: this will be done by inflate(). (So next_in and
- avail_in may be modified, but next_out and avail_out are unchanged.)
-*/
-
-
-ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush));
-/*
- inflate decompresses as much data as possible, and stops when the input
- buffer becomes empty or the output buffer becomes full. It may introduce
- some output latency (reading input without producing any output) except when
- forced to flush.
-
- The detailed semantics are as follows. inflate performs one or both of the
- following actions:
-
- - Decompress more input starting at next_in and update next_in and avail_in
- accordingly. If not all input can be processed (because there is not
- enough room in the output buffer), next_in is updated and processing
- will resume at this point for the next call of inflate().
-
- - Provide more output starting at next_out and update next_out and avail_out
- accordingly. inflate() provides as much output as possible, until there
- is no more input data or no more space in the output buffer (see below
- about the flush parameter).
-
- Before the call of inflate(), the application should ensure that at least
- one of the actions is possible, by providing more input and/or consuming
- more output, and updating the next_* and avail_* values accordingly.
- The application can consume the uncompressed output when it wants, for
- example when the output buffer is full (avail_out == 0), or after each
- call of inflate(). If inflate returns Z_OK and with zero avail_out, it
- must be called again after making room in the output buffer because there
- might be more output pending.
-
- The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH,
- Z_FINISH, or Z_BLOCK. Z_SYNC_FLUSH requests that inflate() flush as much
- output as possible to the output buffer. Z_BLOCK requests that inflate() stop
- if and when it gets to the next deflate block boundary. When decoding the
- zlib or gzip format, this will cause inflate() to return immediately after
- the header and before the first block. When doing a raw inflate, inflate()
- will go ahead and process the first block, and will return when it gets to
- the end of that block, or when it runs out of data.
-
- The Z_BLOCK option assists in appending to or combining deflate streams.
- Also to assist in this, on return inflate() will set strm->data_type to the
- number of unused bits in the last byte taken from strm->next_in, plus 64
- if inflate() is currently decoding the last block in the deflate stream,
- plus 128 if inflate() returned immediately after decoding an end-of-block
- code or decoding the complete header up to just before the first byte of the
- deflate stream. The end-of-block will not be indicated until all of the
- uncompressed data from that block has been written to strm->next_out. The
- number of unused bits may in general be greater than seven, except when
- bit 7 of data_type is set, in which case the number of unused bits will be
- less than eight.
-
- inflate() should normally be called until it returns Z_STREAM_END or an
- error. However if all decompression is to be performed in a single step
- (a single call of inflate), the parameter flush should be set to
- Z_FINISH. In this case all pending input is processed and all pending
- output is flushed; avail_out must be large enough to hold all the
- uncompressed data. (The size of the uncompressed data may have been saved
- by the compressor for this purpose.) The next operation on this stream must
- be inflateEnd to deallocate the decompression state. The use of Z_FINISH
- is never required, but can be used to inform inflate that a faster approach
- may be used for the single inflate() call.
-
- In this implementation, inflate() always flushes as much output as
- possible to the output buffer, and always uses the faster approach on the
- first call. So the only effect of the flush parameter in this implementation
- is on the return value of inflate(), as noted below, or when it returns early
- because Z_BLOCK is used.
-
- If a preset dictionary is needed after this call (see inflateSetDictionary
- below), inflate sets strm->adler to the adler32 checksum of the dictionary
- chosen by the compressor and returns Z_NEED_DICT; otherwise it sets
- strm->adler to the adler32 checksum of all output produced so far (that is,
- total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described
- below. At the end of the stream, inflate() checks that its computed adler32
- checksum is equal to that saved by the compressor and returns Z_STREAM_END
- only if the checksum is correct.
-
- inflate() will decompress and check either zlib-wrapped or gzip-wrapped
- deflate data. The header type is detected automatically. Any information
- contained in the gzip header is not retained, so applications that need that
- information should instead use raw inflate, see inflateInit2() below, or
- inflateBack() and perform their own processing of the gzip header and
- trailer.
-
- inflate() returns Z_OK if some progress has been made (more input processed
- or more output produced), Z_STREAM_END if the end of the compressed data has
- been reached and all uncompressed output has been produced, Z_NEED_DICT if a
- preset dictionary is needed at this point, Z_DATA_ERROR if the input data was
- corrupted (input stream not conforming to the zlib format or incorrect check
- value), Z_STREAM_ERROR if the stream structure was inconsistent (for example
- if next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory,
- Z_BUF_ERROR if no progress is possible or if there was not enough room in the
- output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and
- inflate() can be called again with more input and more output space to
- continue decompressing. If Z_DATA_ERROR is returned, the application may then
- call inflateSync() to look for a good compression block if a partial recovery
- of the data is desired.
-*/
-
-
-ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm));
-/*
- All dynamically allocated data structures for this stream are freed.
- This function discards any unprocessed input and does not flush any
- pending output.
-
- inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state
- was inconsistent. In the error case, msg may be set but then points to a
- static string (which must not be deallocated).
-*/
-
- /* Advanced functions */
-
-/*
- The following functions are needed only in some special applications.
-*/
-
-/*
-ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm,
- int level,
- int method,
- int windowBits,
- int memLevel,
- int strategy));
-
- This is another version of deflateInit with more compression options. The
- fields next_in, zalloc, zfree and opaque must be initialized before by
- the caller.
-
- The method parameter is the compression method. It must be Z_DEFLATED in
- this version of the library.
-
- The windowBits parameter is the base two logarithm of the window size
- (the size of the history buffer). It should be in the range 8..15 for this
- version of the library. Larger values of this parameter result in better
- compression at the expense of memory usage. The default value is 15 if
- deflateInit is used instead.
-
- windowBits can also be -8..-15 for raw deflate. In this case, -windowBits
- determines the window size. deflate() will then generate raw deflate data
- with no zlib header or trailer, and will not compute an adler32 check value.
-
- windowBits can also be greater than 15 for optional gzip encoding. Add
- 16 to windowBits to write a simple gzip header and trailer around the
- compressed data instead of a zlib wrapper. The gzip header will have no
- file name, no extra data, no comment, no modification time (set to zero),
- no header crc, and the operating system will be set to 255 (unknown). If a
- gzip stream is being written, strm->adler is a crc32 instead of an adler32.
-
- The memLevel parameter specifies how much memory should be allocated
- for the internal compression state. memLevel=1 uses minimum memory but
- is slow and reduces compression ratio; memLevel=9 uses maximum memory
- for optimal speed. The default value is 8. See zconf.h for total memory
- usage as a function of windowBits and memLevel.
-
- The strategy parameter is used to tune the compression algorithm. Use the
- value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a
- filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no
- string match), or Z_RLE to limit match distances to one (run-length
- encoding). Filtered data consists mostly of small values with a somewhat
- random distribution. In this case, the compression algorithm is tuned to
- compress them better. The effect of Z_FILTERED is to force more Huffman
- coding and less string matching; it is somewhat intermediate between
- Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as fast as
- Z_HUFFMAN_ONLY, but give better compression for PNG image data. The strategy
- parameter only affects the compression ratio but not the correctness of the
- compressed output even if it is not set appropriately. Z_FIXED prevents the
- use of dynamic Huffman codes, allowing for a simpler decoder for special
- applications.
-
- deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
- memory, Z_STREAM_ERROR if a parameter is invalid (such as an invalid
- method). msg is set to null if there is no error message. deflateInit2 does
- not perform any compression: this will be done by deflate().
-*/
-
-ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm,
- const Bytef *dictionary,
- uInt dictLength));
-/*
- Initializes the compression dictionary from the given byte sequence
- without producing any compressed output. This function must be called
- immediately after deflateInit, deflateInit2 or deflateReset, before any
- call of deflate. The compressor and decompressor must use exactly the same
- dictionary (see inflateSetDictionary).
-
- The dictionary should consist of strings (byte sequences) that are likely
- to be encountered later in the data to be compressed, with the most commonly
- used strings preferably put towards the end of the dictionary. Using a
- dictionary is most useful when the data to be compressed is short and can be
- predicted with good accuracy; the data can then be compressed better than
- with the default empty dictionary.
-
- Depending on the size of the compression data structures selected by
- deflateInit or deflateInit2, a part of the dictionary may in effect be
- discarded, for example if the dictionary is larger than the window size in
- deflate or deflate2. Thus the strings most likely to be useful should be
- put at the end of the dictionary, not at the front. In addition, the
- current implementation of deflate will use at most the window size minus
- 262 bytes of the provided dictionary.
-
- Upon return of this function, strm->adler is set to the adler32 value
- of the dictionary; the decompressor may later use this value to determine
- which dictionary has been used by the compressor. (The adler32 value
- applies to the whole dictionary even if only a subset of the dictionary is
- actually used by the compressor.) If a raw deflate was requested, then the
- adler32 value is not computed and strm->adler is not set.
-
- deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a
- parameter is invalid (such as NULL dictionary) or the stream state is
- inconsistent (for example if deflate has already been called for this stream
- or if the compression method is bsort). deflateSetDictionary does not
- perform any compression: this will be done by deflate().
-*/
-
-ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest,
- z_streamp source));
-/*
- Sets the destination stream as a complete copy of the source stream.
-
- This function can be useful when several compression strategies will be
- tried, for example when there are several ways of pre-processing the input
- data with a filter. The streams that will be discarded should then be freed
- by calling deflateEnd. Note that deflateCopy duplicates the internal
- compression state which can be quite large, so this strategy is slow and
- can consume lots of memory.
-
- deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
- enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
- (such as zalloc being NULL). msg is left unchanged in both source and
- destination.
-*/
-
-ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm));
-/*
- This function is equivalent to deflateEnd followed by deflateInit,
- but does not free and reallocate all the internal compression state.
- The stream will keep the same compression level and any other attributes
- that may have been set by deflateInit2.
-
- deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
- stream state was inconsistent (such as zalloc or state being NULL).
-*/
-
-ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm,
- int level,
- int strategy));
-/*
- Dynamically update the compression level and compression strategy. The
- interpretation of level and strategy is as in deflateInit2. This can be
- used to switch between compression and straight copy of the input data, or
- to switch to a different kind of input data requiring a different
- strategy. If the compression level is changed, the input available so far
- is compressed with the old level (and may be flushed); the new level will
- take effect only at the next call of deflate().
-
- Before the call of deflateParams, the stream state must be set as for
- a call of deflate(), since the currently available input may have to
- be compressed and flushed. In particular, strm->avail_out must be non-zero.
-
- deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source
- stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR
- if strm->avail_out was zero.
-*/
-
-ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm,
- int good_length,
- int max_lazy,
- int nice_length,
- int max_chain));
-/*
- Fine tune deflate's internal compression parameters. This should only be
- used by someone who understands the algorithm used by zlib's deflate for
- searching for the best matching string, and even then only by the most
- fanatic optimizer trying to squeeze out the last compressed bit for their
- specific input data. Read the deflate.c source code for the meaning of the
- max_lazy, good_length, nice_length, and max_chain parameters.
-
- deflateTune() can be called after deflateInit() or deflateInit2(), and
- returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream.
- */
-
-ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm,
- uLong sourceLen));
-/*
- deflateBound() returns an upper bound on the compressed size after
- deflation of sourceLen bytes. It must be called after deflateInit()
- or deflateInit2(). This would be used to allocate an output buffer
- for deflation in a single pass, and so would be called before deflate().
-*/
-
-ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm,
- int bits,
- int value));
-/*
- deflatePrime() inserts bits in the deflate output stream. The intent
- is that this function is used to start off the deflate output with the
- bits leftover from a previous deflate stream when appending to it. As such,
- this function can only be used for raw deflate, and must be used before the
- first deflate() call after a deflateInit2() or deflateReset(). bits must be
- less than or equal to 16, and that many of the least significant bits of
- value will be inserted in the output.
-
- deflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
- stream state was inconsistent.
-*/
-
-ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm,
- gz_headerp head));
-/*
- deflateSetHeader() provides gzip header information for when a gzip
- stream is requested by deflateInit2(). deflateSetHeader() may be called
- after deflateInit2() or deflateReset() and before the first call of
- deflate(). The text, time, os, extra field, name, and comment information
- in the provided gz_header structure are written to the gzip header (xflag is
- ignored -- the extra flags are set according to the compression level). The
- caller must assure that, if not Z_NULL, name and comment are terminated with
- a zero byte, and that if extra is not Z_NULL, that extra_len bytes are
- available there. If hcrc is true, a gzip header crc is included. Note that
- the current versions of the command-line version of gzip (up through version
- 1.3.x) do not support header crc's, and will report that it is a "multi-part
- gzip file" and give up.
-
- If deflateSetHeader is not used, the default gzip header has text false,
- the time set to zero, and os set to 255, with no extra, name, or comment
- fields. The gzip header is returned to the default state by deflateReset().
-
- deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
- stream state was inconsistent.
-*/
-
-/*
-ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm,
- int windowBits));
-
- This is another version of inflateInit with an extra parameter. The
- fields next_in, avail_in, zalloc, zfree and opaque must be initialized
- before by the caller.
-
- The windowBits parameter is the base two logarithm of the maximum window
- size (the size of the history buffer). It should be in the range 8..15 for
- this version of the library. The default value is 15 if inflateInit is used
- instead. windowBits must be greater than or equal to the windowBits value
- provided to deflateInit2() while compressing, or it must be equal to 15 if
- deflateInit2() was not used. If a compressed stream with a larger window
- size is given as input, inflate() will return with the error code
- Z_DATA_ERROR instead of trying to allocate a larger window.
-
- windowBits can also be -8..-15 for raw inflate. In this case, -windowBits
- determines the window size. inflate() will then process raw deflate data,
- not looking for a zlib or gzip header, not generating a check value, and not
- looking for any check values for comparison at the end of the stream. This
- is for use with other formats that use the deflate compressed data format
- such as zip. Those formats provide their own check values. If a custom
- format is developed using the raw deflate format for compressed data, it is
- recommended that a check value such as an adler32 or a crc32 be applied to
- the uncompressed data as is done in the zlib, gzip, and zip formats. For
- most applications, the zlib format should be used as is. Note that comments
- above on the use in deflateInit2() apply to the magnitude of windowBits.
-
- windowBits can also be greater than 15 for optional gzip decoding. Add
- 32 to windowBits to enable zlib and gzip decoding with automatic header
- detection, or add 16 to decode only the gzip format (the zlib format will
- return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is
- a crc32 instead of an adler32.
-
- inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
- memory, Z_STREAM_ERROR if a parameter is invalid (such as a null strm). msg
- is set to null if there is no error message. inflateInit2 does not perform
- any decompression apart from reading the zlib header if present: this will
- be done by inflate(). (So next_in and avail_in may be modified, but next_out
- and avail_out are unchanged.)
-*/
-
-ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm,
- const Bytef *dictionary,
- uInt dictLength));
-/*
- Initializes the decompression dictionary from the given uncompressed byte
- sequence. This function must be called immediately after a call of inflate,
- if that call returned Z_NEED_DICT. The dictionary chosen by the compressor
- can be determined from the adler32 value returned by that call of inflate.
- The compressor and decompressor must use exactly the same dictionary (see
- deflateSetDictionary). For raw inflate, this function can be called
- immediately after inflateInit2() or inflateReset() and before any call of
- inflate() to set the dictionary. The application must ensure that the
- dictionary that was used for compression is provided.
-
- inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a
- parameter is invalid (such as NULL dictionary) or the stream state is
- inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the
- expected one (incorrect adler32 value). inflateSetDictionary does not
- perform any decompression: this will be done by subsequent calls of
- inflate().
-*/
-
-ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm));
-/*
- Skips invalid compressed data until a full flush point (see above the
- description of deflate with Z_FULL_FLUSH) can be found, or until all
- available input is skipped. No output is provided.
-
- inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR
- if no more input was provided, Z_DATA_ERROR if no flush point has been found,
- or Z_STREAM_ERROR if the stream structure was inconsistent. In the success
- case, the application may save the current value of total_in which
- indicates where valid compressed data was found. In the error case, the
- application may repeatedly call inflateSync, providing more input each time,
- until success or end of the input data.
-*/
-
-ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest,
- z_streamp source));
-/*
- Sets the destination stream as a complete copy of the source stream.
-
- This function can be useful when randomly accessing a large stream. The
- first pass through the stream can periodically record the inflate state,
- allowing restarting inflate at those points when randomly accessing the
- stream.
-
- inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
- enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
- (such as zalloc being NULL). msg is left unchanged in both source and
- destination.
-*/
-
-ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm));
-/*
- This function is equivalent to inflateEnd followed by inflateInit,
- but does not free and reallocate all the internal decompression state.
- The stream will keep attributes that may have been set by inflateInit2.
-
- inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
- stream state was inconsistent (such as zalloc or state being NULL).
-*/
-
-ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm,
- int bits,
- int value));
-/*
- This function inserts bits in the inflate input stream. The intent is
- that this function is used to start inflating at a bit position in the
- middle of a byte. The provided bits will be used before any bytes are used
- from next_in. This function should only be used with raw inflate, and
- should be used before the first inflate() call after inflateInit2() or
- inflateReset(). bits must be less than or equal to 16, and that many of the
- least significant bits of value will be inserted in the input.
-
- inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
- stream state was inconsistent.
-*/
-
-ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm,
- gz_headerp head));
-/*
- inflateGetHeader() requests that gzip header information be stored in the
- provided gz_header structure. inflateGetHeader() may be called after
- inflateInit2() or inflateReset(), and before the first call of inflate().
- As inflate() processes the gzip stream, head->done is zero until the header
- is completed, at which time head->done is set to one. If a zlib stream is
- being decoded, then head->done is set to -1 to indicate that there will be
- no gzip header information forthcoming. Note that Z_BLOCK can be used to
- force inflate() to return immediately after header processing is complete
- and before any actual data is decompressed.
-
- The text, time, xflags, and os fields are filled in with the gzip header
- contents. hcrc is set to true if there is a header CRC. (The header CRC
- was valid if done is set to one.) If extra is not Z_NULL, then extra_max
- contains the maximum number of bytes to write to extra. Once done is true,
- extra_len contains the actual extra field length, and extra contains the
- extra field, or that field truncated if extra_max is less than extra_len.
- If name is not Z_NULL, then up to name_max characters are written there,
- terminated with a zero unless the length is greater than name_max. If
- comment is not Z_NULL, then up to comm_max characters are written there,
- terminated with a zero unless the length is greater than comm_max. When
- any of extra, name, or comment are not Z_NULL and the respective field is
- not present in the header, then that field is set to Z_NULL to signal its
- absence. This allows the use of deflateSetHeader() with the returned
- structure to duplicate the header. However if those fields are set to
- allocated memory, then the application will need to save those pointers
- elsewhere so that they can be eventually freed.
-
- If inflateGetHeader is not used, then the header information is simply
- discarded. The header is always checked for validity, including the header
- CRC if present. inflateReset() will reset the process to discard the header
- information. The application would need to call inflateGetHeader() again to
- retrieve the header from the next gzip stream.
-
- inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
- stream state was inconsistent.
-*/
-
-/*
-ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits,
- unsigned char FAR *window));
-
- Initialize the internal stream state for decompression using inflateBack()
- calls. The fields zalloc, zfree and opaque in strm must be initialized
- before the call. If zalloc and zfree are Z_NULL, then the default library-
- derived memory allocation routines are used. windowBits is the base two
- logarithm of the window size, in the range 8..15. window is a caller
- supplied buffer of that size. Except for special applications where it is
- assured that deflate was used with small window sizes, windowBits must be 15
- and a 32K byte window must be supplied to be able to decompress general
- deflate streams.
-
- See inflateBack() for the usage of these routines.
-
- inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of
- the parameters are invalid, Z_MEM_ERROR if the internal state could not
- be allocated, or Z_VERSION_ERROR if the version of the library does not
- match the version of the header file.
-*/
-
-typedef unsigned (*in_func) OF((void FAR *, unsigned char FAR * FAR *));
-typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned));
-
-ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm,
- in_func in, void FAR *in_desc,
- out_func out, void FAR *out_desc));
-/*
- inflateBack() does a raw inflate with a single call using a call-back
- interface for input and output. This is more efficient than inflate() for
- file i/o applications in that it avoids copying between the output and the
- sliding window by simply making the window itself the output buffer. This
- function trusts the application to not change the output buffer passed by
- the output function, at least until inflateBack() returns.
-
- inflateBackInit() must be called first to allocate the internal state
- and to initialize the state with the user-provided window buffer.
- inflateBack() may then be used multiple times to inflate a complete, raw
- deflate stream with each call. inflateBackEnd() is then called to free
- the allocated state.
-
- A raw deflate stream is one with no zlib or gzip header or trailer.
- This routine would normally be used in a utility that reads zip or gzip
- files and writes out uncompressed files. The utility would decode the
- header and process the trailer on its own, hence this routine expects
- only the raw deflate stream to decompress. This is different from the
- normal behavior of inflate(), which expects either a zlib or gzip header and
- trailer around the deflate stream.
-
- inflateBack() uses two subroutines supplied by the caller that are then
- called by inflateBack() for input and output. inflateBack() calls those
- routines until it reads a complete deflate stream and writes out all of the
- uncompressed data, or until it encounters an error. The function's
- parameters and return types are defined above in the in_func and out_func
- typedefs. inflateBack() will call in(in_desc, &buf) which should return the
- number of bytes of provided input, and a pointer to that input in buf. If
- there is no input available, in() must return zero--buf is ignored in that
- case--and inflateBack() will return a buffer error. inflateBack() will call
- out(out_desc, buf, len) to write the uncompressed data buf[0..len-1]. out()
- should return zero on success, or non-zero on failure. If out() returns
- non-zero, inflateBack() will return with an error. Neither in() nor out()
- are permitted to change the contents of the window provided to
- inflateBackInit(), which is also the buffer that out() uses to write from.
- The length written by out() will be at most the window size. Any non-zero
- amount of input may be provided by in().
-
- For convenience, inflateBack() can be provided input on the first call by
- setting strm->next_in and strm->avail_in. If that input is exhausted, then
- in() will be called. Therefore strm->next_in must be initialized before
- calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called
- immediately for input. If strm->next_in is not Z_NULL, then strm->avail_in
- must also be initialized, and then if strm->avail_in is not zero, input will
- initially be taken from strm->next_in[0 .. strm->avail_in - 1].
-
- The in_desc and out_desc parameters of inflateBack() are passed as the
- first parameter of in() and out() respectively when they are called. These
- descriptors can be optionally used to pass any information that the caller-
- supplied in() and out() functions need to do their job.
-
- On return, inflateBack() will set strm->next_in and strm->avail_in to
- pass back any unused input that was provided by the last in() call. The
- return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR
- if in() or out() returned an error, Z_DATA_ERROR if there was a format
- error in the deflate stream (in which case strm->msg is set to indicate the
- nature of the error), or Z_STREAM_ERROR if the stream was not properly
- initialized. In the case of Z_BUF_ERROR, an input or output error can be
- distinguished using strm->next_in which will be Z_NULL only if in() returned
- an error. If strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to
- out() returning non-zero. (in() will always be called before out(), so
- strm->next_in is assured to be defined if out() returns non-zero.) Note
- that inflateBack() cannot return Z_OK.
-*/
-
-ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm));
-/*
- All memory allocated by inflateBackInit() is freed.
-
- inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream
- state was inconsistent.
-*/
-
-ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void));
-/* Return flags indicating compile-time options.
-
- Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other:
- 1.0: size of uInt
- 3.2: size of uLong
- 5.4: size of voidpf (pointer)
- 7.6: size of z_off_t
-
- Compiler, assembler, and debug options:
- 8: DEBUG
- 9: ASMV or ASMINF -- use ASM code
- 10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention
- 11: 0 (reserved)
-
- One-time table building (smaller code, but not thread-safe if true):
- 12: BUILDFIXED -- build static block decoding tables when needed
- 13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed
- 14,15: 0 (reserved)
-
- Library content (indicates missing functionality):
- 16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking
- deflate code when not needed)
- 17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect
- and decode gzip streams (to avoid linking crc code)
- 18-19: 0 (reserved)
-
- Operation variations (changes in library functionality):
- 20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate
- 21: FASTEST -- deflate algorithm with only one, lowest compression level
- 22,23: 0 (reserved)
-
- The sprintf variant used by gzprintf (zero is best):
- 24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format
- 25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure!
- 26: 0 = returns value, 1 = void -- 1 means inferred string length returned
-
- Remainder:
- 27-31: 0 (reserved)
- */
-
-
- /* utility functions */
-
-/*
- The following utility functions are implemented on top of the
- basic stream-oriented functions. To simplify the interface, some
- default options are assumed (compression level and memory usage,
- standard memory allocation functions). The source code of these
- utility functions can easily be modified if you need special options.
-*/
-
-ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen,
- const Bytef *source, uLong sourceLen));
-/*
- Compresses the source buffer into the destination buffer. sourceLen is
- the byte length of the source buffer. Upon entry, destLen is the total
- size of the destination buffer, which must be at least the value returned
- by compressBound(sourceLen). Upon exit, destLen is the actual size of the
- compressed buffer.
- This function can be used to compress a whole file at once if the
- input file is mmap'ed.
- compress returns Z_OK if success, Z_MEM_ERROR if there was not
- enough memory, Z_BUF_ERROR if there was not enough room in the output
- buffer.
-*/
-
-ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen,
- const Bytef *source, uLong sourceLen,
- int level));
-/*
- Compresses the source buffer into the destination buffer. The level
- parameter has the same meaning as in deflateInit. sourceLen is the byte
- length of the source buffer. Upon entry, destLen is the total size of the
- destination buffer, which must be at least the value returned by
- compressBound(sourceLen). Upon exit, destLen is the actual size of the
- compressed buffer.
-
- compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
- memory, Z_BUF_ERROR if there was not enough room in the output buffer,
- Z_STREAM_ERROR if the level parameter is invalid.
-*/
-
-ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen));
-/*
- compressBound() returns an upper bound on the compressed size after
- compress() or compress2() on sourceLen bytes. It would be used before
- a compress() or compress2() call to allocate the destination buffer.
-*/
-
-ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen,
- const Bytef *source, uLong sourceLen));
-/*
- Decompresses the source buffer into the destination buffer. sourceLen is
- the byte length of the source buffer. Upon entry, destLen is the total
- size of the destination buffer, which must be large enough to hold the
- entire uncompressed data. (The size of the uncompressed data must have
- been saved previously by the compressor and transmitted to the decompressor
- by some mechanism outside the scope of this compression library.)
- Upon exit, destLen is the actual size of the uncompressed buffer.
- This function can be used to decompress a whole file at once if the
- input file is mmap'ed.
-
- uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
- enough memory, Z_BUF_ERROR if there was not enough room in the output
- buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete.
-*/
-
-
-typedef voidp gzFile;
-
-ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode));
-/*
- Opens a gzip (.gz) file for reading or writing. The mode parameter
- is as in fopen ("rb" or "wb") but can also include a compression level
- ("wb9") or a strategy: 'f' for filtered data as in "wb6f", 'h' for
- Huffman only compression as in "wb1h", or 'R' for run-length encoding
- as in "wb1R". (See the description of deflateInit2 for more information
- about the strategy parameter.)
-
- gzopen can be used to read a file which is not in gzip format; in this
- case gzread will directly read from the file without decompression.
-
- gzopen returns NULL if the file could not be opened or if there was
- insufficient memory to allocate the (de)compression state; errno
- can be checked to distinguish the two cases (if errno is zero, the
- zlib error is Z_MEM_ERROR). */
-
-ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode));
-/*
- gzdopen() associates a gzFile with the file descriptor fd. File
- descriptors are obtained from calls like open, dup, creat, pipe or
- fileno (if the file has been previously opened with fopen).
- The mode parameter is as in gzopen.
- The next call of gzclose on the returned gzFile will also close the
- file descriptor fd, just like fclose(fdopen(fd, mode)) closes the file
- descriptor fd. If you want to keep fd open, use gzdopen(dup(fd), mode).
- gzdopen returns NULL if there was insufficient memory to allocate
- the (de)compression state.
-*/
-
-ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy));
-/*
- Dynamically update the compression level or strategy. See the description
- of deflateInit2 for the meaning of these parameters.
- gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not
- opened for writing.
-*/
-
-ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len));
-/*
- Reads the given number of uncompressed bytes from the compressed file.
- If the input file was not in gzip format, gzread copies the given number
- of bytes into the buffer.
- gzread returns the number of uncompressed bytes actually read (0 for
- end of file, -1 for error). */
-
-ZEXTERN int ZEXPORT gzwrite OF((gzFile file,
- voidpc buf, unsigned len));
-/*
- Writes the given number of uncompressed bytes into the compressed file.
- gzwrite returns the number of uncompressed bytes actually written
- (0 in case of error).
-*/
-
-ZEXTERN int ZEXPORTVA gzprintf OF((gzFile file, const char *format, ...));
-/*
- Converts, formats, and writes the args to the compressed file under
- control of the format string, as in fprintf. gzprintf returns the number of
- uncompressed bytes actually written (0 in case of error). The number of
- uncompressed bytes written is limited to 4095. The caller should assure that
- this limit is not exceeded. If it is exceeded, then gzprintf() will return
- an error (0) with nothing written. In this case, there may also be a
- buffer overflow with unpredictable consequences, which is possible only if
- zlib was compiled with the insecure functions sprintf() or vsprintf()
- because the secure snprintf() or vsnprintf() functions were not available.
-*/
-
-ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s));
-/*
- Writes the given null-terminated string to the compressed file, excluding
- the terminating null character.
- gzputs returns the number of characters written, or -1 in case of error.
-*/
-
-ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len));
-/*
- Reads bytes from the compressed file until len-1 characters are read, or
- a newline character is read and transferred to buf, or an end-of-file
- condition is encountered. The string is then terminated with a null
- character.
- gzgets returns buf, or Z_NULL in case of error.
-*/
-
-ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c));
-/*
- Writes c, converted to an unsigned char, into the compressed file.
- gzputc returns the value that was written, or -1 in case of error.
-*/
-
-ZEXTERN int ZEXPORT gzgetc OF((gzFile file));
-/*
- Reads one byte from the compressed file. gzgetc returns this byte
- or -1 in case of end of file or error.
-*/
-
-ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file));
-/*
- Push one character back onto the stream to be read again later.
- Only one character of push-back is allowed. gzungetc() returns the
- character pushed, or -1 on failure. gzungetc() will fail if a
- character has been pushed but not read yet, or if c is -1. The pushed
- character will be discarded if the stream is repositioned with gzseek()
- or gzrewind().
-*/
-
-ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush));
-/*
- Flushes all pending output into the compressed file. The parameter
- flush is as in the deflate() function. The return value is the zlib
- error number (see function gzerror below). gzflush returns Z_OK if
- the flush parameter is Z_FINISH and all output could be flushed.
- gzflush should be called only when strictly necessary because it can
- degrade compression.
-*/
-
-ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file,
- z_off_t offset, int whence));
-/*
- Sets the starting position for the next gzread or gzwrite on the
- given compressed file. The offset represents a number of bytes in the
- uncompressed data stream. The whence parameter is defined as in lseek(2);
- the value SEEK_END is not supported.
- If the file is opened for reading, this function is emulated but can be
- extremely slow. If the file is opened for writing, only forward seeks are
- supported; gzseek then compresses a sequence of zeroes up to the new
- starting position.
-
- gzseek returns the resulting offset location as measured in bytes from
- the beginning of the uncompressed stream, or -1 in case of error, in
- particular if the file is opened for writing and the new starting position
- would be before the current position.
-*/
-
-ZEXTERN int ZEXPORT gzrewind OF((gzFile file));
-/*
- Rewinds the given file. This function is supported only for reading.
-
- gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET)
-*/
-
-ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file));
-/*
- Returns the starting position for the next gzread or gzwrite on the
- given compressed file. This position represents a number of bytes in the
- uncompressed data stream.
-
- gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR)
-*/
-
-ZEXTERN int ZEXPORT gzeof OF((gzFile file));
-/*
- Returns 1 when EOF has previously been detected reading the given
- input stream, otherwise zero.
-*/
-
-ZEXTERN int ZEXPORT gzdirect OF((gzFile file));
-/*
- Returns 1 if file is being read directly without decompression, otherwise
- zero.
-*/
-
-ZEXTERN int ZEXPORT gzclose OF((gzFile file));
-/*
- Flushes all pending output if necessary, closes the compressed file
- and deallocates all the (de)compression state. The return value is the zlib
- error number (see function gzerror below).
-*/
-
-ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum));
-/*
- Returns the error message for the last error which occurred on the
- given compressed file. errnum is set to zlib error number. If an
- error occurred in the file system and not in the compression library,
- errnum is set to Z_ERRNO and the application may consult errno
- to get the exact error code.
-*/
-
-ZEXTERN void ZEXPORT gzclearerr OF((gzFile file));
-/*
- Clears the error and end-of-file flags for file. This is analogous to the
- clearerr() function in stdio. This is useful for continuing to read a gzip
- file that is being written concurrently.
-*/
-
- /* checksum functions */
-
-/*
- These functions are not related to compression but are exported
- anyway because they might be useful in applications using the
- compression library.
-*/
-
-ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len));
-/*
- Update a running Adler-32 checksum with the bytes buf[0..len-1] and
- return the updated checksum. If buf is NULL, this function returns
- the required initial value for the checksum.
- An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
- much faster. Usage example:
-
- uLong adler = adler32(0L, Z_NULL, 0);
-
- while (read_buffer(buffer, length) != EOF) {
- adler = adler32(adler, buffer, length);
- }
- if (adler != original_adler) error();
-*/
-
-ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2,
- z_off_t len2));
-/*
- Combine two Adler-32 checksums into one. For two sequences of bytes, seq1
- and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for
- each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of
- seq1 and seq2 concatenated, requiring only adler1, adler2, and len2.
-*/
-
-ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len));
-/*
- Update a running CRC-32 with the bytes buf[0..len-1] and return the
- updated CRC-32. If buf is NULL, this function returns the required initial
- value for the crc. Pre- and post-conditioning (one's complement) is
- performed within this function so it shouldn't be done by the application.
- Usage example:
-
- uLong crc = crc32(0L, Z_NULL, 0);
-
- while (read_buffer(buffer, length) != EOF) {
- crc = crc32(crc, buffer, length);
- }
- if (crc != original_crc) error();
-*/
-
-ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2));
-
-/*
- Combine two CRC-32 check values into one. For two sequences of bytes,
- seq1 and seq2 with lengths len1 and len2, CRC-32 check values were
- calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32
- check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and
- len2.
-*/
-
-
- /* various hacks, don't look :) */
-
-/* deflateInit and inflateInit are macros to allow checking the zlib version
- * and the compiler's view of z_stream:
- */
-ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level,
- const char *version, int stream_size));
-ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm,
- const char *version, int stream_size));
-ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int level, int method,
- int windowBits, int memLevel,
- int strategy, const char *version,
- int stream_size));
-ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int windowBits,
- const char *version, int stream_size));
-ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits,
- unsigned char FAR *window,
- const char *version,
- int stream_size));
-#define deflateInit(strm, level) \
- deflateInit_((strm), (level), ZLIB_VERSION, sizeof(z_stream))
-#define inflateInit(strm) \
- inflateInit_((strm), ZLIB_VERSION, sizeof(z_stream))
-#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \
- deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\
- (strategy), ZLIB_VERSION, sizeof(z_stream))
-#define inflateInit2(strm, windowBits) \
- inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream))
-#define inflateBackInit(strm, windowBits, window) \
- inflateBackInit_((strm), (windowBits), (window), \
- ZLIB_VERSION, sizeof(z_stream))
-
-
-#if !defined(ZUTIL_H) && !defined(NO_DUMMY_DECL)
- struct internal_state {int dummy;}; /* hack for buggy compilers */
-#endif
-
-ZEXTERN const char * ZEXPORT zError OF((int));
-ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp z));
-ZEXTERN const uLongf * ZEXPORT get_crc_table OF((void));
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* ZLIB_H */

--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/rsem.git