Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 20 Apr 2018 01:34:05 +0000 (UTC)
From:      "Jason W. Bacon" <jwb@FreeBSD.org>
To:        ports-committers@freebsd.org, svn-ports-all@freebsd.org, svn-ports-head@freebsd.org
Subject:   svn commit: r467808 - in head/biology: . ddocent ddocent/files
Message-ID:  <201804200134.w3K1Y50u069030@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: jwb
Date: Fri Apr 20 01:34:05 2018
New Revision: 467808
URL: https://svnweb.freebsd.org/changeset/ports/467808

Log:
  biology/ddocent: Bash pipeline for RAD sequencing
  
  Approved by:    jrm (mentor)
  Differential Revision:  https://reviews.freebsd.org/D15139

Added:
  head/biology/ddocent/
  head/biology/ddocent/Makefile   (contents, props changed)
  head/biology/ddocent/distinfo   (contents, props changed)
  head/biology/ddocent/files/
  head/biology/ddocent/files/ddocent-assembly-test   (contents, props changed)
  head/biology/ddocent/files/ddocent-assembly-test-cleanup   (contents, props changed)
  head/biology/ddocent/files/patch-dDocent   (contents, props changed)
  head/biology/ddocent/pkg-descr   (contents, props changed)
  head/biology/ddocent/pkg-plist   (contents, props changed)
Modified:
  head/biology/Makefile

Modified: head/biology/Makefile
==============================================================================
--- head/biology/Makefile	Fri Apr 20 00:44:52 2018	(r467807)
+++ head/biology/Makefile	Fri Apr 20 01:34:05 2018	(r467808)
@@ -22,6 +22,7 @@
     SUBDIR += clustalw
     SUBDIR += consed
     SUBDIR += crux
+    SUBDIR += ddocent
     SUBDIR += diamond
     SUBDIR += emboss
     SUBDIR += fasta

Added: head/biology/ddocent/Makefile
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/biology/ddocent/Makefile	Fri Apr 20 01:34:05 2018	(r467808)
@@ -0,0 +1,67 @@
+# $FreeBSD$
+
+PORTNAME=	dDocent
+DISTVERSIONPREFIX=	v
+DISTVERSION=	2.2.25
+CATEGORIES=	biology java
+
+MAINTAINER=	jwb@FreeBSD.org
+COMMENT=	Bash pipeline for RAD sequencing
+
+LICENSE=	MIT
+
+# ddocent test data do not unpack with FreeBSD 11.1 /usr/bin/unzip
+RUN_DEPENDS=	unzip>=0:archivers/unzip \
+		mawk>=0:lang/mawk \
+		gawk>=0:lang/gawk \
+		coreutils>=0:sysutils/coreutils \
+		gnuplot>=0:math/gnuplot \
+		parallel>=0:sysutils/parallel \
+		bash:shells/bash \
+		bwa>=0.7.13:biology/bwa \
+		cd-hit>=0:biology/cd-hit \
+		samtools>=1.3:biology/samtools \
+		vcftools>=0.1.15:biology/vcftools \
+		trimmomatic>=0:biology/trimmomatic \
+		bamtools>=0:biology/bamtools \
+		stacks>=0:biology/stacks \
+		rainbow>=0:biology/rainbow \
+		trimadap>=0:biology/trimadap \
+		seqtk>=0:biology/seqtk \
+		bedtools>=2.26.0:biology/bedtools \
+		pear-merger>=0:biology/pear-merger \
+		vcflib>=0:biology/vcflib \
+		freebayes:biology/freebayes
+
+USES=		perl5 python shebangfix
+SHEBANG_FILES=	dDocent scripts/*.sh scripts/*.pl scripts/dDocent_filters
+USE_JAVA=	yes
+USE_GITHUB=	yes
+GH_ACCOUNT=	jpuritz
+
+NO_BUILD=	yes
+NO_ARCH=	yes
+
+# These are on top of patch-dDocent, so don't apply them within the source
+# tree, or they'll get picked up by patch generators, and hard-code PREFIX.
+post-install:
+	${REINPLACE_CMD} -i '' \
+		-e 's|%%PREFIX%%|${PREFIX}|g' \
+		-e 's|%%JAVAJARDIR%%|${JAVAJARDIR}|g' \
+		-e 's|%%BASH%%|${LOCALBASE}/bin/bash|g' \
+		-e 's|python|${PYTHON_CMD}|g' \
+		${STAGEDIR}${PREFIX}/bin/dDocent
+
+do-install:
+	${MKDIR} ${STAGEDIR}${PREFIX}/bin
+	${INSTALL_SCRIPT} \
+		${WRKSRC}/dDocent \
+		${WRKSRC}/*.sh \
+		${FILESDIR}/ddocent-assembly-test \
+		${FILESDIR}/ddocent-assembly-test-cleanup \
+		${WRKSRC}/scripts/*.sh \
+		${WRKSRC}/scripts/*.pl \
+		${WRKSRC}/scripts/dDocent_filters \
+		${STAGEDIR}${PREFIX}/bin
+
+.include <bsd.port.mk>

Added: head/biology/ddocent/distinfo
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/biology/ddocent/distinfo	Fri Apr 20 01:34:05 2018	(r467808)
@@ -0,0 +1,3 @@
+TIMESTAMP = 1520345850
+SHA256 (jpuritz-dDocent-v2.2.25_GH0.tar.gz) = 903c3010b29b2ca95f7fe6099925948e4d3f21655668caff653df97dfa7ecf44
+SIZE (jpuritz-dDocent-v2.2.25_GH0.tar.gz) = 336804

Added: head/biology/ddocent/files/ddocent-assembly-test
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/biology/ddocent/files/ddocent-assembly-test	Fri Apr 20 01:34:05 2018	(r467808)
@@ -0,0 +1,380 @@
+#!/usr/bin/env bash
+
+set -e
+
+##########################################################################
+#   Function description:
+#       Pause until user presses return
+##########################################################################
+
+pause()
+{
+    local junk
+    
+    printf "Press return to continue..."
+    read junk
+}
+
+
+##########################################################################
+#   Not necessary if invoking scripts with "bash script-name" as shown
+#   in the dDocent tutorial, but supplied for completeness.  Tutorial
+#   scripts have also been patched upstream to usr /usr/bin/env, but
+#   we won't assume they'll all stay that way.
+##########################################################################
+
+fix_bash_path()
+{
+    # Don't echo these commands
+    { set +x; } 2>/dev/null
+    if [ $# != 1 ]; then
+	printf "Usage: $0 script\n"
+	exit 1
+    fi
+    sed -i.bak -e "s|#!/bin/bash|#!/usr/bin/env bash|g" $1
+    set -x
+}
+
+
+##########################################################################
+#   Main
+##########################################################################
+
+##########################################################################
+#   Workarounds for running dDocent from FreeBSD ports and Pkgsrc.
+#   Include this section in any scripts running dDocent commands.
+##########################################################################
+
+# Hack to allow remake_reference.sh, etc. to find trimmomatic.
+# trimmomatic.jar and adapter files are not an executables and should not
+# be in PATH according to filesystem hierarchy standards, but the dDocent
+# scripts are coded to look for them there, because that's how dDocent
+# installs the bundled Trimmomatic.
+if [ -e /usr/local/share/java/classes/trimmomatic.jar ]; then
+    export PATH=${PATH}:/usr/local/share/java/classes:/usr/local/share/trimmomatic/adapters
+elif [ -e $PKGSRC/lib/java/trimmomatic.jar ]; then
+    export PATH=${PATH}:$PKGSRC/lib/java:$PKGSRC/share/trimmomatic/adapters
+else
+    printf "Error: Trimmomatic is not installed in a known location.\n" >> /dev/stderr
+fi
+
+if ! pwd | fgrep dDocent-test; then
+    mkdir -p dDocent-test
+    cd dDocent-test
+fi
+
+# remake_reference.sh and some other scripts use GNU extensions in awk
+# commands.  Trick systems into using gawk to get around this without
+# having to patch every script.
+mkdir -p ddocent-bin
+ln -sf `which gawk` ddocent-bin/awk
+ln -sf `which python2.7` ddocent-bin/python
+export PATH=`pwd`/ddocent-bin:$PATH
+
+##########################################################################
+#   Actual dDocent commands below
+##########################################################################
+
+##########################################################################
+#   Command sequence from the dDocent tutorial on Github.  See link below.
+##########################################################################
+
+cat << EOM
+
+This script runs the test commands described at
+
+http://ddocent.com/assembly
+
+You may want to follow along on this web page as the script runs.  It will
+pause after each step to allow checking the output.
+
+EOM
+pause
+
+mkdir -p Data
+cd Data
+
+printf "Downloading and unpacking data.zip...\n"
+if [ -e data.zip ]; then
+    mv data.zip /tmp/dDocent-data.zip
+    rm -rf *
+    mv /tmp/dDocent-data.zip data.zip
+else
+    rm -rf *
+    curl --insecure -L -o data.zip \
+	'https://www.dropbox.com/s/t09xjuudev4de72/data.zip?dl=0'
+fi
+
+# Hack: current data.zip contains data.zip.  ??!
+if [ ! -e simRRLs2.py ]; then
+    yes | unzip data.zip || true
+fi
+ls -l
+pause
+
+set -x
+head SimRAD.barcodes
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+cut -f2 SimRAD.barcodes > barcodes
+head barcodes
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+process_radtags -1 SimRAD_R1.fastq.gz -2 SimRAD_R2.fastq.gz -b barcodes \
+    -e ecoRI --renz_2 mspI -r -i gzfastq
+ls | fgrep sample_ | head
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+rm *rem*
+{ set +x; } 2>/dev/null
+pause
+
+rm -f Rename_for_dDocent.sh     # Always get the latest
+set -x
+curl --insecure -L -O https://github.com/jpuritz/dDocent/raw/master/Rename_for_dDocent.sh
+more Rename_for_dDocent.sh
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+bash Rename_for_dDocent.sh SimRAD.barcodes
+{ set +x; } 2>/dev/null
+
+set -x
+ls *.fq.gz
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+ls *.F.fq.gz > namelist
+sed -i'' -e 's/.F.fq.gz//g' namelist
+AWK1='BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}'
+AWK2='!/>/'
+AWK3='!/NNN/'
+PERLT='while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}'
+
+cat namelist | parallel --no-notice -j 8 "zcat {}.F.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.forward"
+cat namelist | parallel --no-notice -j 8 "zcat {}.R.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.reverse"
+cat namelist | parallel --no-notice -j 8 "paste -d '-' {}.forward {}.reverse | mawk '$AWK3' | sed 's/-/NNNNNNNNNN/' | perl -e '$PERLT' > {}.uniq.seqs"
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+cat *.uniq.seqs > uniq.seqs
+for i in {2..20};
+do 
+echo $i >> pfile
+done
+cat pfile | parallel --no-notice \
+    "echo -n {}xxx && mawk -v x={} '\$1 >= x' uniq.seqs |  wc -l" \
+    | mawk  '{gsub("xxx","\t",$0); print;}'| sort -g > uniqseq.data
+rm pfile
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+more uniqseq.data
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+gnuplot << \EOF 
+set terminal dumb size 80, 30
+set autoscale
+set xrange [2:20] 
+unset label
+set title "Number of Unique Sequences with More than X Coverage (Counted within individuals)"
+set xlabel "Coverage"
+set ylabel "Number of Unique Sequences"
+plot 'uniqseq.data' with lines notitle
+pause -1
+EOF
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+parallel --no-notice -j 8 mawk -v x=4 \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' > uniqCperindv
+wc -l uniqCperindv
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+for ((i = 2; i <= 10; i++));
+do
+echo $i >> ufile
+done
+
+cat ufile | parallel --no-notice "echo -n {}xxx && mawk -v x={} '\$1 >= x' uniqCperindv | wc -l" | mawk  '{gsub("xxx","\t",$0); print;}'| sort -g > uniqseq.peri.data
+rm ufile
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+gnuplot << \EOF 
+set terminal dumb size 80, 30
+set autoscale 
+unset label
+set title "Number of Unique Sequences present in more than X Individuals"
+set xlabel "Number of Individuals"
+set ylabel "Number of Unique Sequences"
+plot 'uniqseq.peri.data' with lines notitle
+pause -1
+EOF
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+mawk -v x=4 '$1 >= x' uniqCperindv > uniq.k.4.c.4.seqs
+wc -l uniq.k.4.c.4.seqs
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+cut -f2 uniq.k.4.c.4.seqs > totaluniqseq
+mawk '{c= c + 1; print ">Contig_" c "\n" $1}' totaluniqseq > uniq.fasta
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+sed -e 's/NNNNNNNNNN/\t/g' uniq.fasta | cut -f1 > uniq.F.fasta
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+cd-hit-est -i uniq.F.fasta -o xxx -c 0.8 -T 0 -M 0 -g 1
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+mawk '{if ($1 ~ /Cl/) clus = clus + 1; else  print $3 "\t" clus}' xxx.clstr | sed 's/[>Contig_,...]//g' | sort -g -k1 > sort.contig.cluster.ids
+paste sort.contig.cluster.ids totaluniqseq > contig.cluster.totaluniqseq
+sort -k2,2 -g contig.cluster.totaluniqseq | sed -e 's/NNNNNNNNNN/\t/g' > rcluster
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+head rcluster
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+cut -f2 rcluster | uniq | wc -l
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+rainbow div -i rcluster -o rbdiv.out
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+rainbow div -i rcluster -o rbdiv.out -f 0.5 -K 10
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+rainbow merge -o rbasm.out -a -i rbdiv.out
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+rainbow merge -o rbasm.out -a -i rbdiv.out -r 2
+{ set +x; } 2>/dev/null
+pause
+
+# If missing fdesc mount for bash, <(echo "E") will not work
+# cat rbasm.out <(echo "E") |sed 's/[0-9]*:[0-9]*://g' | mawk ' {
+set -x
+echo "E" > endfile
+cat rbasm.out endfile |sed 's/[0-9]*:[0-9]*://g' | mawk ' {
+if (NR == 1) e=$2;
+else if ($1 ~/E/ && lenp > len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq2 "NNNNNNNNNN" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
+else if ($1 ~/E/ && lenp <= len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
+else if ($1 ~/C/) clus=$2;
+else if ($1 ~/L/) len=$2;
+else if ($1 ~/S/) seq=$2;
+else if ($1 ~/N/) freq=$2;
+else if ($1 ~/R/ && $0 ~/0/ && $0 !~/1/ && len > lenf) {seq1 = seq; fclus=clus;lenf=len}
+else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/) {seq1 = seq; fclus=clus; len1=len}
+else if ($1 ~/R/ && $0 ~!/0/ && freq > freqp && len >= lenp || $1 ~/R/ && $0 ~!/0/ && freq == freqp && len > lenp) {seq2 = seq; lenp = len; freqp=freq}
+}' > rainbow.fasta
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+cd-hit-est -i rainbow.fasta -o referenceRC.fasta -M 0 -T 0 -c 0.9
+{ set +x; } 2>/dev/null
+pause
+
+rm -f remake_reference.sh
+set -x
+curl --insecure -L -O https://github.com/jpuritz/dDocent/raw/master/scripts/remake_reference.sh
+more remake_reference.sh
+#fix_bash_path remake_reference.sh
+
+bash remake_reference.sh 4 4 0.90 PE 2
+{ set +x; } 2>/dev/null
+pause
+
+rm -f ReferenceOpt.sh
+set -x
+curl --insecure -L -O https://github.com/jpuritz/dDocent/raw/master/scripts/ReferenceOpt.sh
+more ReferenceOpt.sh
+
+bash ReferenceOpt.sh 4 8 4 8 PE 16
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+bash remake_reference.sh 4 4 0.90 PE 2
+head reference.fasta
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+wc -l reference.fasta
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+mawk '/>/' reference.fasta | wc -l
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+mawk '/^NAATT.*N*.*CCGN$/' reference.fasta | wc -l
+grep '^NAATT.*N*.*CCGN$' reference.fasta | wc -l
+{ set +x; } 2>/dev/null
+pause
+
+printf "Bonus Section: Optimize reference assemblies? (takes a long time) y/[n] "
+read bonus
+if [ 0$bonus = 0y ]; then
+    set -x
+    curl -L -O https://raw.githubusercontent.com/jpuritz/dDocent/master/scripts/RefMapOpt.sh
+    { set +x; } 2>/dev/null
+    printf "Running dDocent to trim reads.\n"
+    pause
+    set -x
+    cat << EOM | dDocent || true
+yes
+8
+10g
+yes
+no
+no
+no
+bacon@uwm.edu
+EOM
+    bash RefMapOpt.sh 4 8 4 8 0.9 64 PE
+    { set +x; } 2>/dev/null
+    pause
+    more mapping.results
+    pause
+fi

Added: head/biology/ddocent/files/ddocent-assembly-test-cleanup
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/biology/ddocent/files/ddocent-assembly-test-cleanup	Fri Apr 20 01:34:05 2018	(r467808)
@@ -0,0 +1,8 @@
+#!/bin/sh -e
+
+if pwd | fgrep dDocent-test; then
+    rm -rf .*.bak Data ddocent-bin
+else
+    printf "$0 can only be run in a dDocent-test directory.\n"
+    exit 1
+fi

Added: head/biology/ddocent/files/patch-dDocent
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/biology/ddocent/files/patch-dDocent	Fri Apr 20 01:34:05 2018	(r467808)
@@ -0,0 +1,134 @@
+--- dDocent.orig	2018-04-20 00:10:34 UTC
++++ dDocent
+@@ -1,6 +1,9 @@
+ #!/usr/local/bin/bash
+ export LC_ALL=en_US.UTF-8
+ 
++# GNU Parallel uses $SHELL and has issues with [t]csh
++export SHELL=%%BASH%%
++
+ ##########dDocent##########
+ VERSION='2.2.25'
+ #This script serves as an interactive bash wrapper to QC, assemble, map, and call SNPs from double digest RAD (SE or PE), ezRAD (SE or PE) data, or SE RAD data.
+@@ -27,15 +30,15 @@ do
+ 	fi
+ done
+ 
+-if find ${PATH//:/ } -maxdepth 1 -name trimmomatic*jar 2> /dev/null| grep -q 'trim' ; then
+-	TRIMMOMATIC=$(find ${PATH//:/ } -maxdepth 1 -name trimmomatic*jar 2> /dev/null | head -1)
++if [ -e %%JAVAJARDIR%%/trimmomatic.jar ]; then
++       TRIMMOMATIC=%%JAVAJARDIR%%/trimmomatic.jar
+ 	else
+     echo "The dependency trimmomatic is not installed or is not in your" '$PATH'"."
+     NUMDEP=$((NUMDEP + 1))
+ 	fi
+ 
+-if find ${PATH//:/ } -maxdepth 1 -name TruSeq2-PE.fa 2> /dev/null | grep -q 'Tru' ; then
+-	ADAPTERS=$(find ${PATH//:/ } -maxdepth 1 -name TruSeq2-PE.fa 2> /dev/null | head -1)
++if [ -e %%PREFIX%%/share/trimmomatic/adapters/TruSeq2-PE.fa ]; then
++       ADAPTERS=%%PREFIX%%/share/trimmomatic/adapters/TruSeq2-PE.fa
+ 	else
+     echo "The file listing adapters (included with trimmomatic) is not installed or is not in your" '$PATH'"."
+     NUMDEP=$((NUMDEP + 1))
+@@ -80,6 +83,7 @@ FREEB=(`freebayes | grep -oh 'v[0-9].*' 
+         	exit 1
+         fi         	
+ VCFTV=$(vcftools | grep VCF | grep -oh '[0-9]*[a-z]*)$' | sed 's/[a-z)]//')
++	echo $VCFTV
+ 	if [ "$VCFTV" -lt "10" ]; then
+         	echo "The version of VCFtools installed in your" '$PATH' "is not optimized for dDocent."
+         	echo "Please install at least version 0.1.11"
+@@ -89,7 +93,7 @@ VCFTV=$(vcftools | grep VCF | grep -oh '
+         elif [ "$VCFTV" -ge "12" ]; then
+                 VCFGTFLAG="--max-missing"
+ 	fi
+-BWAV=$(bwa 2>&1 | mawk '/Versi/' | sed 's/Version: //g' | sed 's/0.7.//g' | sed 's/-.*//g' | cut -c 1-2)
++BWAV=$(bwa 2>&1 | mawk '/Versi/' | sed 's/Version: //g' | sed 's/0.7.//g' | sed 's/a*-.*//g')
+ 	if [ "$BWAV" -lt "13" ]; then
+         	echo "The version of bwa installed in your" '$PATH' "is not optimized for dDocent."
+         	echo "Please install at least version 0.7.13"
+@@ -107,13 +111,12 @@ BTC=$( bedtools --version | mawk '{print
+ 		exit 1	
+ 	fi
+ 		
+-if ! awk --version | fgrep -v GNU &>/dev/null; then
++if ! awk --version | fgrep GNU &>/dev/null; then
+          awk=gawk
+     else
+          awk=awk
+ fi
+ 
+-
+ if [ $NUMDEP -gt 0 ]; then
+ 	echo -e "\nPlease install all required software before running dDocent again."
+ 	exit 1
+@@ -291,9 +294,9 @@ echo "Using BWA to map reads."
+         	for i in "${NAMES[@]}"
+         	do
+         	if [ -f "$i.R2.fq.gz" ]; then
+-        		bwa mem reference.fasta $i.R1.fq.gz $i.R2.fq.gz -L 20,5 -I $INSERT,$SD,$INSERTH,$INSERTL -t $NUMProc -a -M -T 10 -A $optA -B $optB -O $optO -R "@RG\tID:$i\tSM:$i\tPL:Illumina" 2> bwa.$i.log | mawk '$6 !~/[2-9].[SH]/ && $6 !~ /[1-9][0-9].[SH]/' | samtools view -@$NUMProc -q 1 -SbT reference.fasta - > $i.bam 2>$i.bam.log
++        		bwa mem -L 20,5 -I $INSERT,$SD,$INSERTH,$INSERTL -t $NUMProc -a -M -T 10 -A $optA -B $optB -O $optO -R "@RG\tID:$i\tSM:$i\tPL:Illumina" reference.fasta $i.R1.fq.gz $i.R2.fq.gz 2> bwa.$i.log | mawk '$6 !~/[2-9].[SH]/ && $6 !~ /[1-9][0-9].[SH]/' | samtools view -@$NUMProc -q 1 -SbT reference.fasta - > $i.bam 2>$i.bam.log
+         	else
+-        		bwa mem reference.fasta $i.R1.fq.gz -L 20,5 -t $NUMProc -a -M -T 10 -A $optA -B $optB -O $optO -R "@RG\tID:$i\tSM:$i\tPL:Illumina" 2> bwa.$i.log | mawk '$6 !~/[2-9].[SH]/ && $6 !~ /[1-9][0-9].[SH]/' | samtools view -@$NUMProc -q 1 -SbT reference.fasta - > $i.bam 2>$i.bam.log
++        		bwa mem -L 20,5 -t $NUMProc -a -M -T 10 -A $optA -B $optB -O $optO -R "@RG\tID:$i\tSM:$i\tPL:Illumina" reference.fasta $i.R1.fq.gz 2> bwa.$i.log | mawk '$6 !~/[2-9].[SH]/ && $6 !~ /[1-9][0-9].[SH]/' | samtools view -@$NUMProc -q 1 -SbT reference.fasta - > $i.bam 2>$i.bam.log
+         	fi
+         	samtools sort -@$NUMProc $i.bam -o $i.bam 
+ 		mv $i.bam $i-RG.bam
+@@ -388,10 +391,10 @@ if [ "$SNP" != "no" ]; then
+ 	}
+ 	export -f call_genos
+ 
+-	ls mapped.*.bed | sed 's/mapped.//g' | sed 's/.bed//g' | shuf | parallel --env call_genos --memfree $MAXMemory -j $NUMProc --no-notice call_genos {}
++	ls mapped.*.bed | sed 's/mapped.//g' | sed 's/.bed//g' | gshuf | parallel --env call_genos --memfree $MAXMemory -j $NUMProc --no-notice call_genos {}
+ ####	
+-	#ls mapped.*.bed | sed 's/mapped.//g' | sed 's/.bed//g' | shuf | parallel --memfree $MAXMemory -j $FB1 --no-notice --delay 1 freebayes -L bamlist.list -t mapped.{}.bed -v raw.{}.vcf -f reference.fasta -m 5 -q 5 -E 3 --min-repeat-entropy 1 -V --populations popmap -n 10
+-	#ls mapped.*.bed | sed 's/mapped.//g' | sed 's/.bed//g' | shuf | parallel --memfree $MAXMemory -j $FB1 --no-notice "samtools view -b -L mapped.{}.bed | freebayes -c -t mapped.{}.bed -v raw.{}.vcf -f reference.fasta -m 5 -q 5 -E 3 --min-repeat-entropy 1 -V --populations popmap -n 10"
++	#ls mapped.*.bed | sed 's/mapped.//g' | sed 's/.bed//g' | gshuf | parallel --memfree $MAXMemory -j $FB1 --no-notice --delay 1 freebayes -L bamlist.list -t mapped.{}.bed -v raw.{}.vcf -f reference.fasta -m 5 -q 5 -E 3 --min-repeat-entropy 1 -V --populations popmap -n 10
++	#ls mapped.*.bed | sed 's/mapped.//g' | sed 's/.bed//g' | gshuf | parallel --memfree $MAXMemory -j $FB1 --no-notice "samtools view -b -L mapped.{}.bed | freebayes -c -t mapped.{}.bed -v raw.{}.vcf -f reference.fasta -m 5 -q 5 -E 3 --min-repeat-entropy 1 -V --populations popmap -n 10"
+ 
+ 
+ 	rm mapped.*.bed 
+@@ -447,8 +450,8 @@ fi
+ 
+ #Function for trimming reads using trimmomatic
+ trim_reads(){
+-	TRIMMOMATIC=$(find ${PATH//:/ } -maxdepth 1 -name trimmomatic*jar 2> /dev/null | head -1)
+-    ADAPTERS=$(find ${PATH//:/ } -maxdepth 1 -name TruSeq2-PE.fa 2> /dev/null | head -1)
++       TRIMMOMATIC=%%JAVAJARDIR%%/trimmomatic.jar
++       ADAPTERS=%%PREFIX%%/share/trimmomatic/adapters/TruSeq2-PE.fa
+ 
+ 	if [ -f $1.R.fq.gz ]; then	
+ 		java -Xmx2g -jar $TRIMMOMATIC PE -threads 2 -phred33 $1.F.fq.gz $1.R.fq.gz $1.R1.fq.gz $1.unpairedF.fq.gz $1.R2.fq.gz $1.unpairedR.fq.gz ILLUMINACLIP:$ADAPTERS:2:30:10 LEADING:20 TRAILING:20 SLIDINGWINDOW:5:10 $TW &> $1.trim.log
+@@ -747,7 +750,14 @@ else
+ fi
+ 
+ #Tries to get number of processors, if not asks user
+-NUMProc=( `grep -c ^processor /proc/cpuinfo 2> /dev/null` ) 
++if [ `uname` = Linux ]; then
++    NUMProc=( `grep -c ^processor /proc/cpuinfo 2> /dev/null` ) 
++elif [ `uname` = FreeBSD ]; then
++    NUMProc=( `sysctl -n hw.ncpu` ) 
++else
++    printf "Unsupported platform: `uname`\n"
++    exit 1
++fi
+ NUMProc=$(($NUMProc + 0)) 
+ 
+ echo "dDocent detects $NUMProc processors available on this system."
+@@ -764,7 +774,15 @@ if [ $NUMProc -lt 1 ]; then
+ fi
+ 
+ #Tries to get maximum system memory, if not asks user
+-MAXMemory=$(($(grep -Po '(?<=^MemTotal:)\s*[0-9]+' /proc/meminfo | tr -d " ") / 1048576))G
++if [ `uname` = Linux ]; then
++    MAXMemory=$(($(grep -Po '(?<=^MemTotal:)\s*[0-9]+' /proc/meminfo | tr -d " ") / 1048576))G
++elif [ `uname` = FreeBSD ]; then
++    MAXMemory=`sysctl -n hw.realmem`
++    MAXMemory=$((MAXMemory / 1073741824))G
++else
++    printf "Unsupported platform: `uname`\n"
++    exit 1
++fi
+ 
+ echo "dDocent detects $MAXMemory maximum memory available on this system."
+ echo "Please enter the maximum memory to use for this analysis. The size can be postfixed with 

Added: head/biology/ddocent/pkg-descr
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/biology/ddocent/pkg-descr	Fri Apr 20 01:34:05 2018	(r467808)
@@ -0,0 +1,7 @@
+dDocent is simple bash wrapper to QC, assemble, map, and call SNPs from almost
+any kind of RAD sequencing. If you have a reference already, dDocent can be
+used to call SNPs from almost any type of NGS data set. It is designed to run
+on Linux based machines with large memory capacity and multiple processing
+cores, and it can be modified for use on HPC.
+
+WWW: http://ddocent.com

Added: head/biology/ddocent/pkg-plist
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/biology/ddocent/pkg-plist	Fri Apr 20 01:34:05 2018	(r467808)
@@ -0,0 +1,17 @@
+bin/ErrorCount.sh
+bin/RefMapOpt.sh
+bin/ReferenceOpt.hyb.sh
+bin/ReferenceOpt.sh
+bin/Rename_SequenceFiles.sh
+bin/Rename_for_dDocent.sh
+bin/Rename_for_dDocent_with-INDEX.sh
+bin/dDocent
+bin/dDocent_filters
+bin/dDocent_ngs.sh
+bin/ddocent-assembly-test
+bin/ddocent-assembly-test-cleanup
+bin/filter_hwe_by_pop.pl
+bin/filter_missing_ind.sh
+bin/pop_missing_filter.sh
+bin/remake_reference.sh
+bin/remove.bad.hap.loci.sh



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201804200134.w3K1Y50u069030>