From c6b20e50efe1dd9f03e94a829b79d37db6db064a Mon Sep 17 00:00:00 2001 From: Bruno Baruffaldi Date: Sun, 7 Apr 2019 15:17:12 -0300 Subject: [PATCH 1/5] Delaying the creation of the document. Moving the creation of the document from omindex.cc to index_file.cc, and in this way we avoid creating it prematurely. --- xapian-applications/omega/index_file.cc | 16 +++++++++++++++- xapian-applications/omega/index_file.h | 2 +- xapian-applications/omega/omindex.cc | 17 +---------------- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/xapian-applications/omega/index_file.cc b/xapian-applications/omega/index_file.cc index f831acd1c14..f4a714e71f2 100644 --- a/xapian-applications/omega/index_file.cc +++ b/xapian-applications/omega/index_file.cc @@ -69,6 +69,7 @@ #include "xmlparse.h" #include "xlsxparse.h" #include "xpsxmlparse.h" +#include "hashterm.h" using namespace std; @@ -533,7 +534,7 @@ void index_mimetype(const string & file, const string & urlterm, const string & url, const string & ext, const string &mimetype, DirectoryIterator &d, - Xapian::Document & newdocument, + string & path_term, string record) { string context(file, root.size(), string::npos); @@ -567,6 +568,19 @@ index_mimetype(const string & file, const string & urlterm, const string & url, } if (verbose) cout << flush; + // Use `file` as the basis, as we don't want URL encoding in these terms, + // but need to switch over the initial part so we get `/~olly/foo/bar` not + // `/home/olly/public_html/foo/bar`. 
+ Xapian::Document newdocument; + size_t j; + while ((j = path_term.rfind('/')) > 1 && j != string::npos) { + path_term.resize(j); + if (path_term.length() > MAX_SAFE_TERM_LENGTH) { + newdocument.add_boolean_term(hash_long_term(path_term, MAX_SAFE_TERM_LENGTH)); + } else { + newdocument.add_boolean_term(path_term); + } + } string author, title, sample, keywords, topic, dump; string md5; diff --git a/xapian-applications/omega/index_file.h b/xapian-applications/omega/index_file.h index 479fc49f5b8..93c29c343c4 100644 --- a/xapian-applications/omega/index_file.h +++ b/xapian-applications/omega/index_file.h @@ -112,7 +112,7 @@ index_mimetype(const std::string & file, const std::string & urlterm, const std::string & url, const std::string & ext, const std::string &mimetype, DirectoryIterator &d, - Xapian::Document &doc, + std::string &path_term, std::string record); /// Delete any previously indexed documents we haven't seen. diff --git a/xapian-applications/omega/omindex.cc b/xapian-applications/omega/omindex.cc index 7b81605d34e..9b13af72c53 100644 --- a/xapian-applications/omega/omindex.cc +++ b/xapian-applications/omega/omindex.cc @@ -175,26 +175,11 @@ index_file(const string &file, const string &url, DirectoryIterator & d, cout << "Indexing \"" << file.substr(root.size()) << "\" as " << mimetype << " ... "; - Xapian::Document new_doc; - - // Use `file` as the basis, as we don't want URL encoding in these terms, - // but need to switch over the initial part so we get `/~olly/foo/bar` not - // `/home/olly/public_html/foo/bar`. 
string path_term("P"); path_term += url_start_path; path_term.append(file, root.size(), string::npos); - size_t i; - while ((i = path_term.rfind('/')) > 1 && i != string::npos) { - path_term.resize(i); - if (path_term.length() > MAX_SAFE_TERM_LENGTH) { - new_doc.add_boolean_term(hash_long_term(path_term, MAX_SAFE_TERM_LENGTH)); - } else { - new_doc.add_boolean_term(path_term); - } - } - - index_mimetype(file, urlterm, url, ext, mimetype, d, new_doc, string()); + index_mimetype(file, urlterm, url, ext, mimetype, d, path_term, string()); } static void From fc10c6dcea3719cc95aff44c78e0535a251b5a4f Mon Sep 17 00:00:00 2001 From: Bruno Baruffaldi Date: Sun, 7 Apr 2019 15:19:46 -0300 Subject: [PATCH 2/5] Delaying libmagic checks. The checks done by libmagic can be expensive since it has to open and read the file, so we should avoid them if a cheap check could reject the file. --- xapian-applications/omega/index_file.cc | 18 ++++++++++++++++-- xapian-applications/omega/index_file.h | 2 +- xapian-applications/omega/omindex.cc | 15 --------------- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/xapian-applications/omega/index_file.cc b/xapian-applications/omega/index_file.cc index f4a714e71f2..a4954382beb 100644 --- a/xapian-applications/omega/index_file.cc +++ b/xapian-applications/omega/index_file.cc @@ -533,7 +533,7 @@ index_add_document(const string & urlterm, time_t last_altered, void index_mimetype(const string & file, const string & urlterm, const string & url, const string & ext, - const string &mimetype, DirectoryIterator &d, + string &mimetype, DirectoryIterator &d, string & path_term, string record) { @@ -567,7 +567,21 @@ index_mimetype(const string & file, const string & urlterm, const string & url, } } - if (verbose) cout << flush; + // If we didn't get the mime type from the extension, call libmagic to get + // it. 
+ if (mimetype.empty()) { + mimetype = d.get_magic_mimetype(); + if (mimetype.empty()) { + skip(urlterm, file.substr(root.size()), "Unknown extension and unrecognised format", + d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME); + return; + } + } + + if (verbose) + cout << "Indexing \"" << file.substr(root.size()) << "\" as " + << mimetype << " ... " << flush; + // Use `file` as the basis, as we don't want URL encoding in these terms, // but need to switch over the initial part so we get `/~olly/foo/bar` not // `/home/olly/public_html/foo/bar`. diff --git a/xapian-applications/omega/index_file.h b/xapian-applications/omega/index_file.h index 93c29c343c4..65cd219d00d 100644 --- a/xapian-applications/omega/index_file.h +++ b/xapian-applications/omega/index_file.h @@ -111,7 +111,7 @@ void index_mimetype(const std::string & file, const std::string & urlterm, const std::string & url, const std::string & ext, - const std::string &mimetype, DirectoryIterator &d, + std::string &mimetype, DirectoryIterator &d, std::string &path_term, std::string record); diff --git a/xapian-applications/omega/omindex.cc b/xapian-applications/omega/omindex.cc index 9b13af72c53..00a7946ef0f 100644 --- a/xapian-applications/omega/omindex.cc +++ b/xapian-applications/omega/omindex.cc @@ -160,21 +160,6 @@ index_file(const string &file, const string &url, DirectoryIterator & d, return; } - // If we didn't get the mime type from the extension, call libmagic to get - // it. - if (mimetype.empty()) { - mimetype = d.get_magic_mimetype(); - if (mimetype.empty()) { - skip(urlterm, file.substr(root.size()), "Unknown extension and unrecognised format", - d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME); - return; - } - } - - if (verbose) - cout << "Indexing \"" << file.substr(root.size()) << "\" as " - << mimetype << " ... 
"; - string path_term("P"); path_term += url_start_path; path_term.append(file, root.size(), string::npos); From 67c40177dccf7fd58daa4f3cc7735d5f5d9e82e8 Mon Sep 17 00:00:00 2001 From: Bruno Baruffaldi Date: Mon, 8 Apr 2019 16:37:29 -0300 Subject: [PATCH 3/5] fixup! Delaying the creation of the document. Moving the creation of the document from omindex.cc to index_file.cc, and in this way we avoid creating it prematurely. --- xapian-applications/omega/index_file.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xapian-applications/omega/index_file.cc b/xapian-applications/omega/index_file.cc index a4954382beb..5fdace88069 100644 --- a/xapian-applications/omega/index_file.cc +++ b/xapian-applications/omega/index_file.cc @@ -572,7 +572,8 @@ index_mimetype(const string & file, const string & urlterm, const string & url, if (mimetype.empty()) { mimetype = d.get_magic_mimetype(); if (mimetype.empty()) { - skip(urlterm, file.substr(root.size()), "Unknown extension and unrecognised format", + skip(urlterm, file.substr(root.size()), + "Unknown extension and unrecognised format", d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME); return; } @@ -590,7 +591,8 @@ index_mimetype(const string & file, const string & urlterm, const string & url, while ((j = path_term.rfind('/')) > 1 && j != string::npos) { path_term.resize(j); if (path_term.length() > MAX_SAFE_TERM_LENGTH) { - newdocument.add_boolean_term(hash_long_term(path_term, MAX_SAFE_TERM_LENGTH)); + string term_hash = hash_long_term(path_term, MAX_SAFE_TERM_LENGTH); + newdocument.add_boolean_term(term_hash); } else { newdocument.add_boolean_term(path_term); } From fcd96dba2fac977d9986ec71eb5e595c19a5dc5a Mon Sep 17 00:00:00 2001 From: Bruno Baruffaldi Date: Thu, 11 Apr 2019 12:43:14 -0300 Subject: [PATCH 4/5] fixup! Delaying the creation of the document. Moving the creation of the document from omindex.cc to index_file.cc, and in this way we avoid creating it prematurely. 
--- xapian-applications/omega/index_file.cc | 21 ++++++++++++--------- xapian-applications/omega/index_file.h | 5 +++-- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/xapian-applications/omega/index_file.cc b/xapian-applications/omega/index_file.cc index 5fdace88069..084058e21db 100644 --- a/xapian-applications/omega/index_file.cc +++ b/xapian-applications/omega/index_file.cc @@ -50,6 +50,7 @@ #include "atomparse.h" #include "diritor.h" #include "failed.h" +#include "hashterm.h" #include "md5wrap.h" #include "metaxmlparse.h" #include "mimemap.h" @@ -69,7 +70,6 @@ #include "xmlparse.h" #include "xlsxparse.h" #include "xpsxmlparse.h" -#include "hashterm.h" using namespace std; @@ -533,11 +533,14 @@ index_add_document(const string & urlterm, time_t last_altered, void index_mimetype(const string & file, const string & urlterm, const string & url, const string & ext, - string &mimetype, DirectoryIterator &d, - string & path_term, + const string & mime_type, + DirectoryIterator & d, + const string & path_term, string record) { string context(file, root.size(), string::npos); + string mimetype = mime_type; + string pathterm = path_term; // FIXME: We could be cleverer here and check mtime too when use_ctime is // set - if the ctime has changed but the mtime is unchanged, we can just @@ -573,7 +576,7 @@ index_mimetype(const string & file, const string & urlterm, const string & url, mimetype = d.get_magic_mimetype(); if (mimetype.empty()) { skip(urlterm, file.substr(root.size()), - "Unknown extension and unrecognised format", + "Unknown extension and unrecognised format", d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME); return; } @@ -588,13 +591,13 @@ index_mimetype(const string & file, const string & urlterm, const string & url, // `/home/olly/public_html/foo/bar`. 
Xapian::Document newdocument; size_t j; - while ((j = path_term.rfind('/')) > 1 && j != string::npos) { - path_term.resize(j); - if (path_term.length() > MAX_SAFE_TERM_LENGTH) { - string term_hash = hash_long_term(path_term, MAX_SAFE_TERM_LENGTH); + while ((j = pathterm.rfind('/')) > 1 && j != string::npos) { + pathterm.resize(j); + if (pathterm.length() > MAX_SAFE_TERM_LENGTH) { + string term_hash = hash_long_term(pathterm, MAX_SAFE_TERM_LENGTH); newdocument.add_boolean_term(term_hash); } else { - newdocument.add_boolean_term(path_term); + newdocument.add_boolean_term(pathterm); } } diff --git a/xapian-applications/omega/index_file.h b/xapian-applications/omega/index_file.h index 65cd219d00d..6b4c2d7a325 100644 --- a/xapian-applications/omega/index_file.h +++ b/xapian-applications/omega/index_file.h @@ -111,8 +111,9 @@ void index_mimetype(const std::string & file, const std::string & urlterm, const std::string & url, const std::string & ext, - std::string &mimetype, DirectoryIterator &d, - std::string &path_term, + const std::string & mime_type, + DirectoryIterator & d, + const std::string & path_term, std::string record); /// Delete any previously indexed documents we haven't seen. From 570a04610c2534c7f66b2a5ada67beb8600265bb Mon Sep 17 00:00:00 2001 From: Bruno Baruffaldi Date: Fri, 12 Apr 2019 22:09:27 -0300 Subject: [PATCH 5/5] fixup! fixup! Delaying the creation of the document. Moving the creation of the document from omindex.cc to index_file.cc, and in this way we avoid creating it prematurely. 
--- xapian-applications/omega/index_file.cc | 6 ++---- xapian-applications/omega/index_file.h | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/xapian-applications/omega/index_file.cc b/xapian-applications/omega/index_file.cc index 084058e21db..bb770ac1dfd 100644 --- a/xapian-applications/omega/index_file.cc +++ b/xapian-applications/omega/index_file.cc @@ -533,14 +533,12 @@ index_add_document(const string & urlterm, time_t last_altered, void index_mimetype(const string & file, const string & urlterm, const string & url, const string & ext, - const string & mime_type, + string mimetype, DirectoryIterator & d, - const string & path_term, + string pathterm, string record) { string context(file, root.size(), string::npos); - string mimetype = mime_type; - string pathterm = path_term; // FIXME: We could be cleverer here and check mtime too when use_ctime is // set - if the ctime has changed but the mtime is unchanged, we can just diff --git a/xapian-applications/omega/index_file.h b/xapian-applications/omega/index_file.h index 6b4c2d7a325..4a6e278962e 100644 --- a/xapian-applications/omega/index_file.h +++ b/xapian-applications/omega/index_file.h @@ -111,9 +111,9 @@ void index_mimetype(const std::string & file, const std::string & urlterm, const std::string & url, const std::string & ext, - const std::string & mime_type, + std::string mimetype, DirectoryIterator & d, - const std::string & path_term, + std::string pathterm, std::string record); /// Delete any previously indexed documents we haven't seen.