From f86067952538ad49d5f6085e830cae223a8fe550 Mon Sep 17 00:00:00 2001 From: Tony Sun Date: Thu, 1 Jun 2017 23:24:24 -0700 Subject: [PATCH 1/9] Use ejson body instead of compressed body for external size In two places where we calculate the ExternalSize of the document body, we use the Summary which is a compressed version of the doc body. We change this to use the actual ejson body. In copy_docs we don't have access to the #doc record so we can't access the meta where we store the ejson body. Unfortunately, this means we have to decompress the document body after reading it from disk. COUCHDB-3429 --- src/couch/src/couch_db.erl | 4 +++- src/couch/src/couch_db_updater.erl | 23 +++++++++++++++++++++-- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/couch/src/couch_db.erl b/src/couch/src/couch_db.erl index d01a3e0c43..3222bc50b1 100644 --- a/src/couch/src/couch_db.erl +++ b/src/couch/src/couch_db.erl @@ -1111,7 +1111,9 @@ prepare_doc_summaries(Db, BucketList) -> nil end, SummaryChunk = couch_db_updater:make_doc_summary(Db, {Body, DiskAtts}), - Doc#doc{body = {summary, SummaryChunk, SizeInfo, AttsFd}} + Meta = Doc#doc.meta, + Doc#doc{body = {summary, SummaryChunk, SizeInfo, AttsFd}, + meta = [{ejson_body, Body} | Meta]} end, Bucket) || Bucket <- BucketList]. diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl index 49061b2f64..937901b17f 100644 --- a/src/couch/src/couch_db_updater.erl +++ b/src/couch/src/couch_db_updater.erl @@ -677,6 +677,7 @@ flush_trees(#db{fd = Fd} = Db, case Value of #doc{deleted = IsDeleted, body = {summary, _, _, _} = DocSummary} -> {summary, Summary, AttSizeInfo, AttsFd} = DocSummary, + EJsonBody = get_body_from_meta(Value#doc.meta, Summary), % this node value is actually an unwritten document summary, % write to disk. % make sure the Fd in the written bins is the same Fd we are @@ -695,7 +696,7 @@ flush_trees(#db{fd = Fd} = Db, " changed. 
Possibly retrying.", []), throw(retry) end, - ExternalSize = ?term_size(Summary), + ExternalSize = ?term_size(EJsonBody), {ok, NewSummaryPointer, SummarySize} = couch_file:append_raw_chunk(Fd, Summary), Leaf = #leaf{ @@ -1076,6 +1077,7 @@ check_md5(_, _) -> throw(md5_mismatch). copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos, Retry) -> DocInfoIds = [Id || #doc_info{id=Id} <- MixedInfos], + Compress = NewDb#db.compression, LookupResults = couch_btree:lookup(Db#db.id_tree, DocInfoIds), % COUCHDB-968, make sure we prune duplicates during compaction NewInfos0 = lists:usort(fun(#full_doc_info{id=A}, #full_doc_info{id=B}) -> @@ -1086,8 +1088,15 @@ copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos, Retry) -> {NewRevTree, FinalAcc} = couch_key_tree:mapfold(fun (_Rev, #leaf{ptr=Sp}=Leaf, leaf, SizesAcc) -> {Body, AttInfos} = copy_doc_attachments(Db, Sp, DestFd), + IsComp = couch_compress:is_compressed(Body, Compress), + EJsonBody = case IsComp of + true -> + couch_compress:decompress(Body); + false -> + Body + end, SummaryChunk = make_doc_summary(NewDb, {Body, AttInfos}), - ExternalSize = ?term_size(SummaryChunk), + ExternalSize = ?term_size(EJsonBody), {ok, Pos, SummarySize} = couch_file:append_raw_chunk( DestFd, SummaryChunk), AttSizes = [{element(3,A), element(4,A)} || A <- AttInfos], @@ -1467,6 +1476,16 @@ make_doc_summary(#db{compression = Comp}, {Body0, Atts0}) -> SummaryBin = ?term_to_bin({Body, Atts}), couch_file:assemble_file_chunk(SummaryBin, couch_crypto:hash(md5, SummaryBin)). + +get_body_from_meta(Meta, Summary) -> + case lists:keyfind(ejson_body, 1, Meta) of + {ejson_body, Body} -> + Body; + false -> + couch_compress:decompress(Summary) + end. 
+ + default_security_object(<<"shards/", _/binary>>) -> case config:get("couchdb", "default_security", "everyone") of "admin_only" -> From c2e2f237d049d3059e11b90c15994194930c850f Mon Sep 17 00:00:00 2001 From: Tony Sun Date: Mon, 26 Jun 2017 18:34:02 -0700 Subject: [PATCH 2/9] Use size in meta instead of Body COUCHDB-3429 --- src/couch/src/couch_db.erl | 3 ++- src/couch/src/couch_db_updater.erl | 11 +++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/couch/src/couch_db.erl b/src/couch/src/couch_db.erl index 3222bc50b1..65cd26ee5b 100644 --- a/src/couch/src/couch_db.erl +++ b/src/couch/src/couch_db.erl @@ -1112,8 +1112,9 @@ prepare_doc_summaries(Db, BucketList) -> end, SummaryChunk = couch_db_updater:make_doc_summary(Db, {Body, DiskAtts}), Meta = Doc#doc.meta, + ExternalSize = ?term_size(Body), Doc#doc{body = {summary, SummaryChunk, SizeInfo, AttsFd}, - meta = [{ejson_body, Body} | Meta]} + meta = [{ejson_size, ExternalSize} | Meta]} end, Bucket) || Bucket <- BucketList]. diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl index 937901b17f..6d09eb88ea 100644 --- a/src/couch/src/couch_db_updater.erl +++ b/src/couch/src/couch_db_updater.erl @@ -677,7 +677,7 @@ flush_trees(#db{fd = Fd} = Db, case Value of #doc{deleted = IsDeleted, body = {summary, _, _, _} = DocSummary} -> {summary, Summary, AttSizeInfo, AttsFd} = DocSummary, - EJsonBody = get_body_from_meta(Value#doc.meta, Summary), + ExternalSize = get_meta_body_size(Value#doc.meta, Summary), % this node value is actually an unwritten document summary, % write to disk. % make sure the Fd in the written bins is the same Fd we are @@ -696,7 +696,6 @@ flush_trees(#db{fd = Fd} = Db, " changed. 
Possibly retrying.", []), throw(retry) end, - ExternalSize = ?term_size(EJsonBody), {ok, NewSummaryPointer, SummarySize} = couch_file:append_raw_chunk(Fd, Summary), Leaf = #leaf{ @@ -1477,10 +1476,10 @@ make_doc_summary(#db{compression = Comp}, {Body0, Atts0}) -> couch_file:assemble_file_chunk(SummaryBin, couch_crypto:hash(md5, SummaryBin)). -get_body_from_meta(Meta, Summary) -> - case lists:keyfind(ejson_body, 1, Meta) of - {ejson_body, Body} -> - Body; +get_meta_body_size(Meta, Summary) -> + case lists:keyfind(ejson_size, 1, Meta) of + {ejson_size, ExternalSize} -> + ExternalSize; false -> couch_compress:decompress(Summary) end. From 64d848c9e46cceba6a25e55c7a5fefad827c2942 Mon Sep 17 00:00:00 2001 From: Tony Sun Date: Mon, 26 Jun 2017 18:39:34 -0700 Subject: [PATCH 3/9] use is_binary couchdb-3429 --- src/couch/src/couch_db_updater.erl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl index 6d09eb88ea..eee72eea15 100644 --- a/src/couch/src/couch_db_updater.erl +++ b/src/couch/src/couch_db_updater.erl @@ -1087,8 +1087,7 @@ copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos, Retry) -> {NewRevTree, FinalAcc} = couch_key_tree:mapfold(fun (_Rev, #leaf{ptr=Sp}=Leaf, leaf, SizesAcc) -> {Body, AttInfos} = copy_doc_attachments(Db, Sp, DestFd), - IsComp = couch_compress:is_compressed(Body, Compress), - EJsonBody = case IsComp of + EJsonBody = case is_binary(Body) of true -> couch_compress:decompress(Body); false -> From 7fec381a7fd79dff6dac9fefa308ccc886cdf567 Mon Sep 17 00:00:00 2001 From: Tony Sun Date: Mon, 26 Jun 2017 22:03:11 -0700 Subject: [PATCH 4/9] Add test that external size is the same for different compression methods COUCHDB-3429 --- src/couch/test/couchdb_file_compression_tests.erl | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/couch/test/couchdb_file_compression_tests.erl b/src/couch/test/couchdb_file_compression_tests.erl index a91a92447f..5fcb4675b0 
100644 --- a/src/couch/test/couchdb_file_compression_tests.erl +++ b/src/couch/test/couchdb_file_compression_tests.erl @@ -115,12 +115,14 @@ compare_compression_methods(DbName) -> compact_view(DbName), DbSizeNone = db_disk_size(DbName), ViewSizeNone = view_disk_size(DbName), + ExternalSizeNone = db_external_size(DbName), config:set("couchdb", "file_compression", "snappy", false), compact_db(DbName), compact_view(DbName), DbSizeSnappy = db_disk_size(DbName), ViewSizeSnappy = view_disk_size(DbName), + ExternalSizeSnappy = db_external_size(DbName), ?assert(DbSizeNone > DbSizeSnappy), ?assert(ViewSizeNone > ViewSizeSnappy), @@ -139,9 +141,12 @@ compare_compression_methods(DbName) -> compact_view(DbName), DbSizeDeflate9 = db_disk_size(DbName), ViewSizeDeflate9 = view_disk_size(DbName), + ExternalSizeDeflate9 = db_external_size(DbName), ?assert(DbSizeDeflate1 > DbSizeDeflate9), - ?assert(ViewSizeDeflate1 > ViewSizeDeflate9). + ?assert(ViewSizeDeflate1 > ViewSizeDeflate9), + ?assert(ExternalSizeNone =:= ExternalSizeSnappy), + ?assert(ExternalSizeNone =:= ExternalSizeDeflate9). populate_db(_Db, NumDocs) when NumDocs =< 0 -> @@ -186,6 +191,12 @@ db_disk_size(DbName) -> ok = couch_db:close(Db), active_size(Info). +db_external_size(DbName) -> + {ok, Db} = couch_db:open_int(DbName, []), + {ok, Info} = couch_db:get_db_info(Db), + ok = couch_db:close(Db), + external_size(Info). + view_disk_size(DbName) -> {ok, Db} = couch_db:open_int(DbName, []), {ok, DDoc} = couch_db:open_doc(Db, ?DDOC_ID, [ejson_body]), @@ -196,6 +207,9 @@ view_disk_size(DbName) -> active_size(Info) -> couch_util:get_nested_json_value({Info}, [sizes, active]). +external_size(Info) -> + couch_util:get_nested_json_value({Info}, [sizes, external]). 
+ wait_compaction(DbName, Kind, Line) -> WaitFun = fun() -> case is_compaction_running(DbName) of From a8b220014a45109d4ed1e46e89b011ead9a0925c Mon Sep 17 00:00:00 2001 From: Tony Sun Date: Mon, 26 Jun 2017 22:19:11 -0700 Subject: [PATCH 5/9] add pre-compaction external size value couchdb-3429 --- src/couch/test/couchdb_file_compression_tests.erl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/couch/test/couchdb_file_compression_tests.erl b/src/couch/test/couchdb_file_compression_tests.erl index 5fcb4675b0..1567f3bd64 100644 --- a/src/couch/test/couchdb_file_compression_tests.erl +++ b/src/couch/test/couchdb_file_compression_tests.erl @@ -111,6 +111,7 @@ should_compare_compression_methods(DbName) -> compare_compression_methods(DbName) -> config:set("couchdb", "file_compression", "none", false), + ExternalSizePreCompact = db_external_size(DbName), compact_db(DbName), compact_view(DbName), DbSizeNone = db_disk_size(DbName), @@ -145,10 +146,10 @@ compare_compression_methods(DbName) -> ?assert(DbSizeDeflate1 > DbSizeDeflate9), ?assert(ViewSizeDeflate1 > ViewSizeDeflate9), + ?assert(ExternalSizePreCompact =:= ExternalSizeNone), ?assert(ExternalSizeNone =:= ExternalSizeSnappy), ?assert(ExternalSizeNone =:= ExternalSizeDeflate9). 
- populate_db(_Db, NumDocs) when NumDocs =< 0 -> ok; populate_db(Db, NumDocs) -> From 8b825571118b82bc8853072b9cb69cfba7237259 Mon Sep 17 00:00:00 2001 From: Tony Sun Date: Mon, 26 Jun 2017 22:24:18 -0700 Subject: [PATCH 6/9] whitespace --- src/couch/test/couchdb_file_compression_tests.erl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/couch/test/couchdb_file_compression_tests.erl b/src/couch/test/couchdb_file_compression_tests.erl index 1567f3bd64..b208ac0c19 100644 --- a/src/couch/test/couchdb_file_compression_tests.erl +++ b/src/couch/test/couchdb_file_compression_tests.erl @@ -150,6 +150,7 @@ compare_compression_methods(DbName) -> ?assert(ExternalSizeNone =:= ExternalSizeSnappy), ?assert(ExternalSizeNone =:= ExternalSizeDeflate9). + populate_db(_Db, NumDocs) when NumDocs =< 0 -> ok; populate_db(Db, NumDocs) -> From b3eb718098f1b5e64c823e9b6d515764ca7841e5 Mon Sep 17 00:00:00 2001 From: Tony Sun Date: Wed, 12 Jul 2017 10:56:25 -0700 Subject: [PATCH 7/9] cleanup and formatting --- src/couch/src/couch_db.erl | 7 ++++--- src/couch/src/couch_db_updater.erl | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/couch/src/couch_db.erl b/src/couch/src/couch_db.erl index 65cd26ee5b..e4e3a8b349 100644 --- a/src/couch/src/couch_db.erl +++ b/src/couch/src/couch_db.erl @@ -1112,9 +1112,10 @@ prepare_doc_summaries(Db, BucketList) -> end, SummaryChunk = couch_db_updater:make_doc_summary(Db, {Body, DiskAtts}), Meta = Doc#doc.meta, - ExternalSize = ?term_size(Body), - Doc#doc{body = {summary, SummaryChunk, SizeInfo, AttsFd}, - meta = [{ejson_size, ExternalSize} | Meta]} + Doc#doc{ + body = {summary, SummaryChunk, SizeInfo, AttsFd}, + meta = [{ejson_size, ?term_size(Body)} | Meta] + } end, Bucket) || Bucket <- BucketList]. 
diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl index eee72eea15..26921bcfe7 100644 --- a/src/couch/src/couch_db_updater.erl +++ b/src/couch/src/couch_db_updater.erl @@ -1076,7 +1076,6 @@ check_md5(_, _) -> throw(md5_mismatch). copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos, Retry) -> DocInfoIds = [Id || #doc_info{id=Id} <- MixedInfos], - Compress = NewDb#db.compression, LookupResults = couch_btree:lookup(Db#db.id_tree, DocInfoIds), % COUCHDB-968, make sure we prune duplicates during compaction NewInfos0 = lists:usort(fun(#full_doc_info{id=A}, #full_doc_info{id=B}) -> From c07d43775906fc5e0cb5aa7549ee31dbd6516db9 Mon Sep 17 00:00:00 2001 From: Tony Sun Date: Wed, 12 Jul 2017 11:12:14 -0700 Subject: [PATCH 8/9] add comment --- src/couch/src/couch_db_updater.erl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl index 26921bcfe7..45468ebd85 100644 --- a/src/couch/src/couch_db_updater.erl +++ b/src/couch/src/couch_db_updater.erl @@ -1086,6 +1086,8 @@ copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos, Retry) -> {NewRevTree, FinalAcc} = couch_key_tree:mapfold(fun (_Rev, #leaf{ptr=Sp}=Leaf, leaf, SizesAcc) -> {Body, AttInfos} = copy_doc_attachments(Db, Sp, DestFd), + % In the future, we should figure out how to do this for + % upgrade purposes. 
EJsonBody = case is_binary(Body) of true -> couch_compress:decompress(Body); From 2065100bab696d8dc6086835d10023d11320221c Mon Sep 17 00:00:00 2001 From: Tony Sun Date: Fri, 14 Jul 2017 10:13:08 -0700 Subject: [PATCH 9/9] fix missing term_size --- src/couch/src/couch_db_updater.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl index 45468ebd85..277f2b5352 100644 --- a/src/couch/src/couch_db_updater.erl +++ b/src/couch/src/couch_db_updater.erl @@ -1481,7 +1481,7 @@ get_meta_body_size(Meta, Summary) -> {ejson_size, ExternalSize} -> ExternalSize; false -> - couch_compress:decompress(Summary) + ?term_size(couch_compress:decompress(Summary)) end.