From 94e269c682987a9ef2762503ba3d8e3a9e8926f7 Mon Sep 17 00:00:00 2001 From: bacon Date: Mon, 7 Jan 2019 02:33:17 +0000 Subject: [PATCH] biology/canu: import canu-1.8 Canu is a fork of the Celera Assembler, designed for high-noise single-molecule sequencing (such as the PacBio RS II/Sequel or Oxford Nanopore MinION). Canu is a hierarchical assembly pipeline which runs in four steps: Detect overlaps in high-noise sequences using MHAP Generate corrected sequence consensus Trim corrected sequences Assemble trimmed corrected sequences --- biology/canu/DESCR | 12 +++ biology/canu/Makefile | 80 ++++++++++++++++++ biology/canu/PLIST | 80 ++++++++++++++++++ biology/canu/distinfo | 10 +++ biology/canu/patches/patch-Makefile | 48 +++++++++++ .../patches/patch-pipelines_canu_Defaults.pm | 24 ++++++ .../patches/patch-pipelines_canu_Execution.pm | 83 +++++++++++++++++++ .../canu/patches/patch-utility_objectStore.C | 19 +++++ 8 files changed, 356 insertions(+) create mode 100644 biology/canu/DESCR create mode 100644 biology/canu/Makefile create mode 100644 biology/canu/PLIST create mode 100644 biology/canu/distinfo create mode 100644 biology/canu/patches/patch-Makefile create mode 100644 biology/canu/patches/patch-pipelines_canu_Defaults.pm create mode 100644 biology/canu/patches/patch-pipelines_canu_Execution.pm create mode 100644 biology/canu/patches/patch-utility_objectStore.C diff --git a/biology/canu/DESCR b/biology/canu/DESCR new file mode 100644 index 000000000000..719f42cf95ce --- /dev/null +++ b/biology/canu/DESCR @@ -0,0 +1,12 @@ +Canu is a fork of the Celera Assembler, designed for high-noise single-molecule +sequencing (such as the PacBio RS II/Sequel or Oxford Nanopore MinION). 
+ +Canu is a hierarchical assembly pipeline which runs in four steps: + + Detect overlaps in high-noise sequences using MHAP + + Generate corrected sequence consensus + + Trim corrected sequences + + Assemble trimmed corrected sequences diff --git a/biology/canu/Makefile b/biology/canu/Makefile new file mode 100644 index 000000000000..db605ef37a57 --- /dev/null +++ b/biology/canu/Makefile @@ -0,0 +1,80 @@ +# $NetBSD: Makefile,v 1.1 2019/01/07 02:33:17 bacon Exp $ + +DISTNAME= canu-1.8 +CATEGORIES= biology java +MASTER_SITES= ${MASTER_SITE_GITHUB:=marbl/} +GITHUB_TAG= v${PKGVERSION_NOREV} + +MAINTAINER= bacon@NetBSD.org +HOMEPAGE= http://canu.readthedocs.io/ +COMMENT= Single molecule sequence assembler for genomes large and small +LICENSE= gnu-gpl-v2 + +DEPENDS+= gnuplot>0:../../graphics/gnuplot + +# Makefile compiles directly into ${DESTDIR}${PREFIX} +SUBST_CLASSES+= optimize +SUBST_STAGE.optimize= pre-configure +SUBST_SED.optimize+= -e 's|-g3||g' +SUBST_SED.optimize+= -e 's|-O3||g' +SUBST_SED.optimize+= -e 's|-O4||g' +SUBST_SED.optimize+= -e 's|-funroll-loops||g' +SUBST_SED.optimize+= -e 's|-fexpensive-optimizations||g' +SUBST_SED.optimize+= -e 's|-finline-functions||g' +SUBST_FILES.optimize+= ${WRKSRC}/Makefile + +SUBST_CLASSES+= jar +SUBST_STAGE.jar= pre-configure +SUBST_SED.jar+= -e 's|\\$$bin/mhap-|${PREFIX}/${JAVAJARDIR}/mhap-|g' +SUBST_FILES.jar+= ${WRKSRC}/pipelines/canu/OverlapMhap.pm + +SUBST_CLASSES+= perl +SUBST_STAGE.perl= pre-configure +SUBST_SED.perl+= -e 's|$$FindBin::RealBin/../lib/site_perl|${PREFIX}/${SITE_PERL_REL}|g' +SUBST_FILES.perl+= ${WRKSRC}/pipelines/canu.pl + +REPLACE_PERL+= bogart/*.pl +REPLACE_PERL+= pipelines/*.pl +REPLACE_PERL+= bogus/*.pl +REPLACE_PERL+= bogart-analysis/*.pl +REPLACE_PERL+= overlapBasedTrimming/*.pl +REPLACE_PERL+= overlapInCore-analysis/*.pl +REPLACE_PERL+= merTrim/*.pl +REPLACE_PERL+= erateEstimate/*.pl +REPLACE_PERL+= meryl/*.pl +REPLACE_PERL+= fastq-utilities/*.pl +REPLACE_PERL+= *.pl + +# May work on other 
64-bit processors, but untested +ONLY_FOR_PLATFORM= *-*-x86_64 + +USE_LANGUAGES= c c++ +USE_JAVA= run +USE_JAVA2= 8 +USE_TOOLS+= gmake pax perl +GCC_REQD+= 4.8 + +WRKSRC= ${WRKDIR}/canu-${PKGVERSION_NOREV}/src +MAKE_ENV+= DESTDIR=${WRKSRC} + +TMP_INST= ${WRKSRC}${PREFIX}/${OPSYS}-${MACHINE_ARCH:S/x86_64/amd64/} +SITE_PERL_REL= lib/perl5/site_perl +JAVAJARDIR= share/java/classes +INSTALLATION_DIRS= bin ${JAVAJARDIR} + +post-extract: + ${CHMOD} -R g-w ${WRKDIR} + +post-build: + ${MKDIR} ${TMP_INST}/lib/perl5 + ${MV} ${TMP_INST}/lib/site_perl ${TMP_INST}/lib/perl5 + ${RM} -f ${TMP_INST}/bin/canu.defaults + +do-install: + cd ${TMP_INST}/bin && ${PAX} -wr * ${DESTDIR}${PREFIX}/bin + cd ${TMP_INST}/lib/perl5 && ${PAX} -wr * ${DESTDIR}${PREFIX}/lib/perl5 + cd ${TMP_INST}/share && ${PAX} -wr * ${DESTDIR}${PREFIX}/share + +.include "../../devel/boost-libs/buildlink3.mk" +.include "../../lang/perl5/module.mk" +.include "../../mk/bsd.pkg.mk" diff --git a/biology/canu/PLIST b/biology/canu/PLIST new file mode 100644 index 000000000000..1f88345abe09 --- /dev/null +++ b/biology/canu/PLIST @@ -0,0 +1,80 @@ +@comment $NetBSD: PLIST,v 1.1 2019/01/07 02:33:17 bacon Exp $ +bin/alignGFA +bin/bogart +bin/bogus +bin/canu +bin/correctOverlaps +bin/dumpBlob +bin/edalign +bin/errorEstimate +bin/falconsense +bin/fastqAnalyze +bin/fastqSample +bin/fastqSimulate +bin/fastqSimulate-sort +bin/filterCorrectionLayouts +bin/filterCorrectionOverlaps +bin/findErrors +bin/findErrors-Dump +bin/generateCorrectionLayouts +bin/loadCorrectedReads +bin/loadErates +bin/loadTrimmedReads +bin/meryl +bin/mhapConvert +bin/mmapConvert +bin/ovStoreBucketizer +bin/ovStoreBuild +bin/ovStoreConfig +bin/ovStoreDump +bin/ovStoreIndexer +bin/ovStoreSorter +bin/ovStoreStats +bin/overlapConvert +bin/overlapImport +bin/overlapInCore +bin/overlapInCorePartition +bin/overlapPair +bin/prefixEditDistance-matchLimitGenerate +bin/readConsensus +bin/sequence +bin/splitHaplotype +bin/splitReads +bin/sqStoreCreate 
+bin/sqStoreCreatePartition +bin/sqStoreDumpFASTQ +bin/sqStoreDumpMetaData +bin/tgStoreCompress +bin/tgStoreCoverageStat +bin/tgStoreDump +bin/tgStoreFilter +bin/tgStoreLoad +bin/tgTigDisplay +bin/trimReads +bin/utgcns +bin/wtdbgConvert +lib/perl5/site_perl/canu/Configure.pm +lib/perl5/site_perl/canu/Consensus.pm +lib/perl5/site_perl/canu/CorrectReads.pm +lib/perl5/site_perl/canu/Defaults.pm +lib/perl5/site_perl/canu/Execution.pm +lib/perl5/site_perl/canu/Grid.pm +lib/perl5/site_perl/canu/Grid_Cloud.pm +lib/perl5/site_perl/canu/Grid_DNANexus.pm +lib/perl5/site_perl/canu/Grid_LSF.pm +lib/perl5/site_perl/canu/Grid_PBSTorque.pm +lib/perl5/site_perl/canu/Grid_SGE.pm +lib/perl5/site_perl/canu/Grid_Slurm.pm +lib/perl5/site_perl/canu/HaplotypeReads.pm +lib/perl5/site_perl/canu/Meryl.pm +lib/perl5/site_perl/canu/Output.pm +lib/perl5/site_perl/canu/OverlapBasedTrimming.pm +lib/perl5/site_perl/canu/OverlapErrorAdjustment.pm +lib/perl5/site_perl/canu/OverlapInCore.pm +lib/perl5/site_perl/canu/OverlapMMap.pm +lib/perl5/site_perl/canu/OverlapMhap.pm +lib/perl5/site_perl/canu/OverlapStore.pm +lib/perl5/site_perl/canu/Report.pm +lib/perl5/site_perl/canu/SequenceStore.pm +lib/perl5/site_perl/canu/Unitig.pm +share/java/classes/mhap-2.1.3.jar diff --git a/biology/canu/distinfo b/biology/canu/distinfo new file mode 100644 index 000000000000..aa0c188c6367 --- /dev/null +++ b/biology/canu/distinfo @@ -0,0 +1,10 @@ +$NetBSD: distinfo,v 1.1 2019/01/07 02:33:17 bacon Exp $ + +SHA1 (canu-1.8.tar.gz) = 7dd79415aa5ecb95f05109f0d8d58f7cbfc336e9 +RMD160 (canu-1.8.tar.gz) = 78d4872b4034f526037ce225c699debd910bd586 +SHA512 (canu-1.8.tar.gz) = 650bc96675f371596f8e7748d4ab2d229f0262bf84cee8fed59af43d534d76095a72e4ba0b4a5ce9f561992268c317964cda2f6c89ee514f4920e2ba47fbc86c +Size (canu-1.8.tar.gz) = 2465314 bytes +SHA1 (patch-Makefile) = aa83003677cbb12558e438c776402ec48df0598d +SHA1 (patch-pipelines_canu_Defaults.pm) = 55a4631d86abb1881b0cc997514d44c536209ea6 +SHA1 
(patch-pipelines_canu_Execution.pm) = fbb080c06ea5d2393d1835a61771715d2aef0274 +SHA1 (patch-utility_objectStore.C) = c8407de79abbaf296f027f704080cc7e878b85f4 diff --git a/biology/canu/patches/patch-Makefile b/biology/canu/patches/patch-Makefile new file mode 100644 index 000000000000..d3aa3323d49f --- /dev/null +++ b/biology/canu/patches/patch-Makefile @@ -0,0 +1,48 @@ +$NetBSD: patch-Makefile,v 1.1 2019/01/07 02:33:17 bacon Exp $ + +# Template Makefile requires platform-specific defaults for each OS +# To be sent upstream following commit + +--- Makefile.orig 2018-10-22 16:47:31.000000000 +0000 ++++ Makefile +@@ -545,6 +545,40 @@ endif + endif + + ++ifeq (${OSTYPE}, NetBSD) ++ CC ?= gcc ++ CXX ?= g++ ++ ++ # GCC ++ CXXFLAGS += -pthread -fopenmp -fPIC ++ LDFLAGS += -pthread -fopenmp -lm -lexecinfo ++ ++ #CXXFLAGS += -Wall -Wextra -Wformat -Wno-unused -Wno-parentheses ++ CXXFLAGS += -Wall -Wextra -Wformat -Wno-unused-function -Wno-unused-parameter -Wno-unused-variable -Wno-char-subscripts -Wno-write-strings -Wno-sign-compare -Wno-format-truncation ++ ++ # Google Performance Tools malloc and heapchecker (HEAPCHECK=normal) ++ #CXXFLAGS += ++ #LDFLAGS += -ltcmalloc ++ ++ # Google Performance Tools cpu profiler (CPUPROFILE=/path) ++ #CXXFLAGS += ++ #LDFLAGS += -lprofiler ++ ++ # callgrind ++ #CXXFLAGS += -g3 -Wa,--gstabs -save-temps ++ ++ ifeq ($(BUILDOPTIMIZED), 1) ++ else ++ CXXFLAGS += -g3 ++ endif ++ ++ ifeq ($(BUILDDEBUG), 1) ++ else ++ CXXFLAGS += -O3 -finline-functions -fomit-frame-pointer ++ endif ++endif ++ ++ + ifneq (,$(findstring CYGWIN, ${OSTYPE})) + CC ?= gcc + CXX ?= g++ diff --git a/biology/canu/patches/patch-pipelines_canu_Defaults.pm b/biology/canu/patches/patch-pipelines_canu_Defaults.pm new file mode 100644 index 000000000000..aabd51f3ca06 --- /dev/null +++ b/biology/canu/patches/patch-pipelines_canu_Defaults.pm @@ -0,0 +1,24 @@ +$NetBSD: patch-pipelines_canu_Defaults.pm,v 1.1 2019/01/07 02:33:17 bacon Exp $ + +# Add resource limits for SLURM +# 
Upstream is considering a scheduler-independent approach to this feature + +--- pipelines/canu/Defaults.pm.orig 2018-06-22 08:20:52.000000000 +0000 ++++ pipelines/canu/Defaults.pm +@@ -812,6 +812,16 @@ sub setDefaults () { + setDefault("gridEngineArraySubmitID", undef, "Grid engine configuration, not documented"); + setDefault("gridEngineJobID", undef, "Grid engine configuration, not documented"); + ++ ##### Slurm-specific parameters for controlling the number of ++ ##### cores / tasks dispatched per step or globally (WIP) ++ ++ setDefault( 'slurmCormhapCoreLimit', undef, 'Maximum number of cores allocated for MHAP pre-computing and alignment within the correction phase' ); ++ setDefault( 'slurmOvbCoreLimit', undef, 'Maximum number of single-core tasks dispatched for the ovlStore bucketizing step within the trimming phase' ); ++ setDefault( 'slurmOvsCoreLimit', undef, 'Maximum number of single-core tasks dispatched for the ovlStore sorting step within the trimming phase' ); ++ setDefault( 'slurmRedCoreLimit', undef, 'Maximum number of cores allocated for read error detection within the unitigging phase' ); ++ setDefault( 'slurmArrayTaskLimit', undef, 'Maximum number of tasks permitted for each step throughout assembly' ); ++ setDefault( 'slurmArrayCoreLimit', undef, 'Maximum number of cores allocated for each step throughout assembly' ); ++ + ##### Grid Engine Pipeline + + setDefault("useGrid", 1, "If 'true', enable grid-based execution; if 'false', run all jobs on the local machine; if 'remote', create jobs for grid execution but do not submit; default 'true'"); diff --git a/biology/canu/patches/patch-pipelines_canu_Execution.pm b/biology/canu/patches/patch-pipelines_canu_Execution.pm new file mode 100644 index 000000000000..30f37359990c --- /dev/null +++ b/biology/canu/patches/patch-pipelines_canu_Execution.pm @@ -0,0 +1,83 @@ +$NetBSD: patch-pipelines_canu_Execution.pm,v 1.1 2019/01/07 02:33:17 bacon Exp $ + +# Add resource limits for SLURM +# Upstream is 
considering a scheduler-independent approach to this feature + +--- pipelines/canu/Execution.pm.orig 2018-06-22 08:20:52.000000000 +0000 ++++ pipelines/canu/Execution.pm +@@ -303,10 +303,6 @@ sub skipStage ($$@) { + sub getInstallDirectory () { + my $installDir = $FindBin::RealBin; + +- if ($installDir =~ m!^(.*)/\w+-\w+/bin$!) { +- $installDir = $1; +- } +- + return($installDir); + } + +@@ -694,8 +690,8 @@ sub submitScript ($$) { + + + +-sub buildGridArray ($$$$) { +- my ($name, $bgn, $end, $opt) = @_; ++sub buildGridArray (@) { ++ my ( $name, $bgn, $end, $opt, $thr ) = @_; + my $off = 0; + + # In some grids (SGE) this is the maximum size of an array job. +@@ -725,8 +721,42 @@ sub buildGridArray ($$$$) { + $off = "-F \"$off\""; + } + +- $opt =~ s/ARRAY_NAME/$name/g; # Replace ARRAY_NAME with 'job name' +- $opt =~ s/ARRAY_JOBS/$bgn-$end/g; # Replace ARRAY_JOBS with 'bgn-end' ++ if( $opt =~ m/(ARRAY_NAME)/ ) ++ { ++ $opt =~ s/$1/$name/; # Replace ARRAY_NAME with 'job name' ++ } ++ elsif( $opt =~ m/(ARRAY_JOBS)/ ) ++ { ++ $opt =~ s/$1/$bgn-$end/; # Replace ARRAY_JOBS with 'bgn-end' ++ ++ if( lc( getGlobal( 'gridEngine' ) ) eq 'slurm' && $end > 1 ) ++ { ++ if( $name =~ m/^cormhap_/i && defined getGlobal( 'slurmCormhapCoreLimit' ) ) ++ { ++ $opt .= '%' . int( getGlobal( 'slurmCormhapCoreLimit' ) / $thr ); ++ } ++ elsif( $name =~ m/^ovb_/i && defined getGlobal( 'slurmOvbCoreLimit' ) ) ++ { ++ $opt .= '%' . getGlobal( 'slurmOvbCoreLimit' ); ++ } ++ elsif( $name =~ m/^ovs_/i && defined getGlobal( 'slurmOvsCoreLimit' ) ) ++ { ++ $opt .= '%' . getGlobal( 'slurmOvsCoreLimit' ); ++ } ++ elsif( $name =~ m/^red_/i && defined getGlobal( 'slurmRedCoreLimit' ) ) ++ { ++ $opt .= '%' . int( getGlobal( 'slurmRedCoreLimit' ) / $thr ); ++ } ++ elsif( defined getGlobal( 'slurmArrayTaskLimit' ) ) ++ { ++ $opt .= '%' . getGlobal( 'slurmArrayTaskLimit' ); ++ } ++ elsif( defined getGlobal( 'slurmArrayCoreLimit' ) ) ++ { ++ $opt .= '%' . 
int( getGlobal( 'slurmArrayCoreLimit' ) / $thr ); ++ } ++ } ++ } + + return($opt, $off); + } +@@ -870,7 +900,7 @@ sub buildGridJob ($$$$$$$$$) { + my $jobNameT = makeUniqueJobName($jobType, $asm); + + my ($jobName, $jobOff) = buildGridArray($jobNameT, $bgnJob, $endJob, getGlobal("gridEngineArrayName")); +- my ($arrayOpt, $arrayOff) = buildGridArray($jobNameT, $bgnJob, $endJob, getGlobal("gridEngineArrayOption")); ++ my ( $arrayOpt, $arrayOff ) = buildGridArray( $jobNameT, $bgnJob, $endJob, getGlobal( "gridEngineArrayOption" ), $thr ); + + my $outputOption = buildOutputOption($path, $script); + diff --git a/biology/canu/patches/patch-utility_objectStore.C b/biology/canu/patches/patch-utility_objectStore.C new file mode 100644 index 000000000000..8c4af7f90919 --- /dev/null +++ b/biology/canu/patches/patch-utility_objectStore.C @@ -0,0 +1,19 @@ +$NetBSD: patch-utility_objectStore.C,v 1.1 2019/01/07 02:33:17 bacon Exp $ + +# NetBSD does not provide WEXITED +# Empty bitmask should suffice in this case + +--- utility/objectStore.C.orig 2018-12-23 01:57:06.000000000 +0000 ++++ utility/objectStore.C +@@ -286,6 +286,11 @@ fetchFromObjectStore(char *requested) { + // Otherwise, we're still the parent, so wait for the (-1 == any) child + // process to terminate. + ++// NetBSD does not provide WEXITED so send empty bitmask ++#ifdef __NetBSD__ ++#define WEXITED 0 ++#endif ++ + waitpid(-1, &err, WEXITED); + + if ((WIFEXITED(err)) &&