diff --git a/capnp/serialize/defs.capnp b/capnp/serialize/defs.capnp index c4b7c4c..86fd414 100644 --- a/capnp/serialize/defs.capnp +++ b/capnp/serialize/defs.capnp @@ -3,6 +3,8 @@ using Cxx = import "/capnp/c++.capnp"; $Cxx.namespace("GLnexus::capnp"); +### discovered alleles + struct DiscoveredAlleleInfo { isRef @0 :Bool = false; @@ -38,3 +40,29 @@ struct DiscoveredAlleles { contigs @1 : List(Contig); aips @2 :List(AlleleInfoPair); } + +### unified sites + +struct OriginalAllele { + pos @0 :Range; + dna @1 :Text; + unifiedAllele @2 :Int64; +} + +struct UnifiedSite { + pos @0 :Range; + containingTargetOption :union { + noContainingTarget @1 :Void; + containingTarget @2 :Range; + } + alleles @3 :List(Text); + unification @4 :List(OriginalAllele); + alleleFrequencies @5 :List(Float32); + lostAlleleFrequency @6 :Float32; + qual @7 :Int64; + monoallelic @8 :Bool; +} + +struct UnifiedSites { + sites @0 :List(UnifiedSite); +} diff --git a/cli/glnexus_cli.cc b/cli/glnexus_cli.cc index ef337d5..53bcf8d 100644 --- a/cli/glnexus_cli.cc +++ b/cli/glnexus_cli.cc @@ -120,6 +120,12 @@ static int all_steps(const vector &vcf_files, console->info() << "Writing unified sites as YAML to " << filename; H("write unified sites to file", GLnexus::cli::utils::write_unified_sites_to_file(sites, contigs, filename)); + + string filename_cflat("/tmp/sites.cflat"); + console->info() << "Writing unified sites in serialized capnproto to " << filename + << ", verifying that it is readable"; + H("Verify that the cflat file is readable", + GLnexus::capnp_unified_sites_verify(sites, filename_cflat)); } // genotype diff --git a/include/types.h b/include/types.h index 6b1b9cd..93c9b87 100644 --- a/include/types.h +++ b/include/types.h @@ -520,6 +520,21 @@ struct unified_site { unified_site& ans); }; +// write vector structure to a file, with cap'n proto serialization +Status capnp_of_unified_sites(const std::vector &sites, const std::string &filename); + +// write unified sites capnp to a file descriptor. +Status capnp_of_unified_sites_fd(const std::vector &sites, int fd); + +// read unified sites from a capnp file +Status unified_sites_of_capnp(const std::string &filename, std::vector& sites); + +// Verify that we can serialize and deserialize unified sites +// +// Note: this is a debugging function +Status capnp_unified_sites_verify(const std::vector &sites, const std::string &filename); + + // Statistics collected during range queries struct StatsRangeQuery { int64_t nBCFRecordsRead; // how many BCF records were read from the DB diff --git a/src/types.cc b/src/types.cc index 16bd7e6..38823ee 100644 --- a/src/types.cc +++ b/src/types.cc @@ -600,6 +600,167 @@ Status unified_site::of_yaml(const YAML::Node& yaml, const vector& sites, int fd) { + ::capnp::MallocMessageBuilder message; + auto us_b = message.initRoot(); + + auto sites_b = us_b.initSites(sites.size()); + for (int i = 0; i < sites.size(); i++) { + const auto& site = sites[i]; + auto site_b = sites_b[i]; + int j; + + auto pos_b = site_b.initPos(); + pos_b.setRid(site.pos.rid); + pos_b.setBeg(site.pos.beg); + pos_b.setEnd(site.pos.end); + + if (site.containing_target.rid > -1) { + auto ct_b = site_b.getContainingTargetOption().initContainingTarget(); + ct_b.setRid(site.containing_target.rid); + ct_b.setBeg(site.containing_target.beg); + ct_b.setEnd(site.containing_target.end); + } else { + site_b.getContainingTargetOption().setNoContainingTarget(::capnp::VOID); + } + + auto alleles_b = site_b.initAlleles(site.alleles.size()); + for (j = 0; j < site.alleles.size(); j++) { + alleles_b.set(j, site.alleles[j]); + } + + auto unification_b = site_b.initUnification(site.unification.size()); + j = 0; + for (const auto& p : site.unification) { + auto oa_b = unification_b[j++]; + auto oa_pos_b = oa_b.initPos(); + oa_pos_b.setRid(p.first.pos.rid); + oa_pos_b.setBeg(p.first.pos.beg); + oa_pos_b.setEnd(p.first.pos.end); + oa_b.setDna(p.first.dna); + oa_b.setUnifiedAllele(p.second); + } + + auto allele_frequencies_b = site_b.initAlleleFrequencies(site.allele_frequencies.size()); + for (j = 0; j < site.allele_frequencies.size(); j++) { + allele_frequencies_b.set(j, site.allele_frequencies[j]); + } + + site_b.setLostAlleleFrequency(site.lost_allele_frequency); + site_b.setQual(site.qual); + site_b.setMonoallelic(site.monoallelic); + } + + // Capnp throws exceptions on errors, and only in extreme cases. + writePackedMessageToFd(fd, message); + return Status::OK(); +} + + +Status capnp_of_unified_sites_fd(const vector &sites, int fd) { + try { + return capnp_write_unified_sites_fd(sites, fd); + } catch (exception &e) { + return Status::IOError("Capnproto error during de-serialization", e.what()); + } +} + +Status capnp_of_unified_sites(const vector& sites, const std::string &filename) { + int fd = open(filename.c_str(), O_WRONLY | O_CREAT | O_TRUNC, S_IRWXU); + if (fd < 0) { + return Status::IOError("Could not open file for writing", filename); + } + Status s = capnp_of_unified_sites_fd(sites, fd); + if (close(fd) != 0) { + return Status::IOError("file close failed", filename); + } + return s; +} + +// read unified_site list from a file-descriptor, as serialized by cap'n proto +static Status _capnp_read_unified_sites_fd(int fd, vector& sites) { + ::capnp::ReaderOptions ropt; + ropt.traversalLimitInWords = CAPNP_TRAVERSAL_LIMIT; + ::capnp::PackedFdMessageReader message(fd, ropt); + auto sites_r = message.getRoot(); + + sites.clear(); + for(const auto site_r : sites_r.getSites()) { + range pos(-1,-1,-1); + const auto pos_r = site_r.getPos(); + pos.rid = pos_r.getRid(); + pos.beg = pos_r.getBeg(); + pos.end = pos_r.getEnd(); + + unified_site site(pos); + + const auto cto_r = site_r.getContainingTargetOption(); + if (cto_r.hasContainingTarget()) { + const auto ct_r = cto_r.getContainingTarget(); + site.containing_target.rid = ct_r.getRid(); + site.containing_target.beg = ct_r.getBeg(); + site.containing_target.end = ct_r.getEnd(); + } + + for (const auto& al : site_r.getAlleles()) { + site.alleles.push_back(string(al)); + } + + for (const auto oa_r : site_r.getUnification()) { + range oa_pos(-1,-1,-1); + auto oa_pos_r = oa_r.getPos(); + oa_pos.rid = oa_pos_r.getRid(); + oa_pos.beg = oa_pos_r.getBeg(); + oa_pos.end = oa_pos_r.getEnd(); + + site.unification[allele(oa_pos, oa_r.getDna())] = oa_r.getUnifiedAllele(); + } + + for (const auto x : site_r.getAlleleFrequencies()) { + site.allele_frequencies.push_back(x); + } + + site.lost_allele_frequency = site_r.getLostAlleleFrequency(); + site.qual = site_r.getQual(); + site.monoallelic = site_r.getMonoallelic(); + + sites.push_back(std::move(site)); + } + + return Status::OK(); +} + +Status unified_sites_of_capnp(const std::string &filename, vector& sites) { + int fd = open(filename.c_str(), O_RDONLY); + if (fd < 0) { + return Status::IOError("Could not open file for reading", filename); + } + Status s; + try { + // Capnp throws exceptions on errors, and only in extreme cases. + s = _capnp_read_unified_sites_fd(fd, sites); + } catch (exception &e) { + return Status::IOError("Capnproto error during de-serialization", e.what()); + } + if (close(fd) != 0) { + return Status::IOError("file close failed", filename); + } + return s; +} + +// verify roundtrip of sites +Status capnp_unified_sites_verify(const vector& sites, const string& filename) { + Status s; + vector sites2; + S(capnp_of_unified_sites(sites, filename)); + S(unified_sites_of_capnp(filename, sites2)); + if (sites == sites2) { + return Status::OK(); + } + return Status::Invalid("capnp serialization/deserialization of unified sites does not return original value"); +} + Status unifier_config::of_yaml(const YAML::Node& yaml, unifier_config& ans) { Status s; diff --git a/test/gvcf_test_cases.cc b/test/gvcf_test_cases.cc index fcb9a89..0f02ad3 100644 --- a/test/gvcf_test_cases.cc +++ b/test/gvcf_test_cases.cc @@ -231,6 +231,7 @@ class GVCFTestCase { const auto n_unified_sites = yaml["truth_unified_sites"]; REQUIRE(n_unified_sites); S(parse_unified_sites(n_unified_sites, contigs)); + REQUIRE(GLnexus::capnp_unified_sites_verify(truth_sites, "/tmp/unified_sites_capnp_check").ok()); } // write truth gvcf if test_genotypes is set to true diff --git a/test/types.cc b/test/types.cc index 96d11d4..a0fa93e 100644 --- a/test/types.cc +++ b/test/types.cc @@ -345,13 +345,14 @@ quality: 100 SECTION("snp+del") { vector ns = YAML::LoadAll("---\n" + string(snp) + "\n---\n" + string(del) + "\n..."); REQUIRE(ns.size() == 2); - unified_site us(range(-1,-1,-1)); + unified_site us(range(-1,-1,-1)), us2(range(-1,-1,-1)); Status s = unified_site::of_yaml(ns[0], contigs, us); REQUIRE(s.ok()); VERIFY_SNP(us); - s = unified_site::of_yaml(ns[1], contigs, us); + s = unified_site::of_yaml(ns[1], contigs, us2); REQUIRE(s.ok()); - VERIFY_DEL(us); + VERIFY_DEL(us2); + REQUIRE(GLnexus::capnp_unified_sites_verify({us, us2}, "/tmp/unified_sites.capnp").ok()); } SECTION("optional ref") { @@ -497,6 +498,8 @@ monoallelic: false unified_site us2(range(-1,-1,-1)); REQUIRE(unified_site::of_yaml(n, contigs, us2).ok()); REQUIRE(us == us2); + + REQUIRE(GLnexus::capnp_unified_sites_verify({us}, "/tmp/unified_sites.capnp").ok()); } }