// Source blob: ed711c14b439885da47802195717abc0434ced5e
// Copyright (C) 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "icing/index/embedding-indexing-handler.h"
#include <cstdint>
#include <initializer_list>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
#include "icing/file/portable-file-backed-proto-log.h"
#include "icing/index/embed/embedding-hit.h"
#include "icing/index/embed/embedding-index.h"
#include "icing/index/embed/posting-list-embedding-hit-accessor.h"
#include "icing/index/hit/hit.h"
#include "icing/portable/platform.h"
#include "icing/proto/document_wrapper.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/schema-builder.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/embedding-test-utils.h"
#include "icing/testing/fake-clock.h"
#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/util/status-macros.h"
#include "icing/util/tokenized-document.h"
#include "unicode/uloc.h"
namespace icing {
namespace lib {
namespace {
using ::testing::ElementsAre;
using ::testing::Eq;
using ::testing::IsEmpty;
using ::testing::IsTrue;
// Indexable properties (section) and section id. Section id is determined by
// the lexicographical order of indexable property paths.
// Schema type with indexable properties: FakeType
// Section id = 0: "body"
// Section id = 1: "bodyEmbedding"
// Section id = 2: "title"
// Section id = 3: "titleEmbedding"
static constexpr std::string_view kFakeType = "FakeType";
static constexpr std::string_view kPropertyBody = "body";
static constexpr std::string_view kPropertyBodyEmbedding = "bodyEmbedding";
static constexpr std::string_view kPropertyTitle = "title";
static constexpr std::string_view kPropertyTitleEmbedding = "titleEmbedding";
static constexpr std::string_view kPropertyNonIndexableEmbedding =
"nonIndexableEmbedding";
static constexpr SectionId kSectionIdBodyEmbedding = 1;
static constexpr SectionId kSectionIdTitleEmbedding = 3;
// Schema type with nested indexable properties: FakeCollectionType
// Section id = 0: "collection.body"
// Section id = 1: "collection.bodyEmbedding"
// Section id = 2: "collection.title"
// Section id = 3: "collection.titleEmbedding"
// Section id = 4: "fullDocEmbedding"
static constexpr std::string_view kFakeCollectionType = "FakeCollectionType";
static constexpr std::string_view kPropertyCollection = "collection";
static constexpr std::string_view kPropertyFullDocEmbedding =
"fullDocEmbedding";
static constexpr SectionId kSectionIdNestedBodyEmbedding = 1;
static constexpr SectionId kSectionIdNestedTitleEmbedding = 3;
static constexpr SectionId kSectionIdFullDocEmbedding = 4;
class EmbeddingIndexingHandlerTest : public ::testing::Test {
protected:
void SetUp() override {
if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
ICING_ASSERT_OK(
// File generated via icu_data_file rule in //icing/BUILD.
icu_data_file_helper::SetUpICUDataFile(
GetTestFilePath("icing/icu.dat")));
}
base_dir_ = GetTestTempDir() + "/icing_test";
ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
IsTrue());
embedding_index_working_path_ = base_dir_ + "/embedding_index";
schema_store_dir_ = base_dir_ + "/schema_store";
document_store_dir_ = base_dir_ + "/document_store";
ICING_ASSERT_OK_AND_ASSIGN(
embedding_index_,
EmbeddingIndex::Create(&filesystem_, embedding_index_working_path_));
language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
ICING_ASSERT_OK_AND_ASSIGN(
lang_segmenter_,
language_segmenter_factory::Create(std::move(segmenter_options)));
ASSERT_THAT(
filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()),
IsTrue());
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
SchemaProto schema =
SchemaBuilder()
.AddType(
SchemaTypeConfigBuilder()
.SetType(kFakeType)
.AddProperty(PropertyConfigBuilder()
.SetName(kPropertyTitle)
.SetDataTypeString(TERM_MATCH_EXACT,
TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(PropertyConfigBuilder()
.SetName(kPropertyBody)
.SetDataTypeString(TERM_MATCH_EXACT,
TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_REPEATED))
.AddProperty(
PropertyConfigBuilder()
.SetName(kPropertyTitleEmbedding)
.SetDataTypeVector(
EmbeddingIndexingConfig::EmbeddingIndexingType::
LINEAR_SEARCH)
.SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(
PropertyConfigBuilder()
.SetName(kPropertyBodyEmbedding)
.SetDataTypeVector(
EmbeddingIndexingConfig::EmbeddingIndexingType::
LINEAR_SEARCH)
.SetCardinality(CARDINALITY_REPEATED))
.AddProperty(PropertyConfigBuilder()
.SetName(kPropertyNonIndexableEmbedding)
.SetDataType(TYPE_VECTOR)
.SetCardinality(CARDINALITY_REPEATED)))
.AddType(SchemaTypeConfigBuilder()
.SetType(kFakeCollectionType)
.AddProperty(PropertyConfigBuilder()
.SetName(kPropertyCollection)
.SetDataTypeDocument(
kFakeType,
/*index_nested_properties=*/true)
.SetCardinality(CARDINALITY_REPEATED))
.AddProperty(
PropertyConfigBuilder()
.SetName(kPropertyFullDocEmbedding)
.SetDataTypeVector(
EmbeddingIndexingConfig::
EmbeddingIndexingType::LINEAR_SEARCH)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
ICING_ASSERT_OK(schema_store_->SetSchema(
schema, /*ignore_errors_and_delete_documents=*/false,
/*allow_circular_schema_definitions=*/false));
ASSERT_TRUE(
filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str()));
ICING_ASSERT_OK_AND_ASSIGN(
DocumentStore::CreateResult doc_store_create_result,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
schema_store_.get(),
/*force_recovery_and_revalidate_documents=*/false,
/*namespace_id_fingerprint=*/false,
/*pre_mapping_fbv=*/false,
/*use_persistent_hash_map=*/false,
PortableFileBackedProtoLog<
DocumentWrapper>::kDeflateCompressionLevel,
/*initialize_stats=*/nullptr));
document_store_ = std::move(doc_store_create_result.document_store);
}
void TearDown() override {
document_store_.reset();
schema_store_.reset();
lang_segmenter_.reset();
embedding_index_.reset();
filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
}
libtextclassifier3::StatusOr<std::vector<EmbeddingHit>> GetHits(
uint32_t dimension, std::string_view model_signature) {
std::vector<EmbeddingHit> hits;
libtextclassifier3::StatusOr<
std::unique_ptr<PostingListEmbeddingHitAccessor>>
pl_accessor_or =
embedding_index_->GetAccessor(dimension, model_signature);
std::unique_ptr<PostingListEmbeddingHitAccessor> pl_accessor;
if (pl_accessor_or.ok()) {
pl_accessor = std::move(pl_accessor_or).ValueOrDie();
} else if (absl_ports::IsNotFound(pl_accessor_or.status())) {
return hits;
} else {
return std::move(pl_accessor_or).status();
}
while (true) {
ICING_ASSIGN_OR_RETURN(std::vector<EmbeddingHit> batch,
pl_accessor->GetNextHitsBatch());
if (batch.empty()) {
return hits;
}
hits.insert(hits.end(), batch.begin(), batch.end());
}
}
std::vector<float> GetRawEmbeddingData() {
return std::vector<float>(embedding_index_->GetRawEmbeddingData(),
embedding_index_->GetRawEmbeddingData() +
embedding_index_->GetTotalVectorSize());
}
Filesystem filesystem_;
FakeClock fake_clock_;
std::string base_dir_;
std::string embedding_index_working_path_;
std::string schema_store_dir_;
std::string document_store_dir_;
std::unique_ptr<EmbeddingIndex> embedding_index_;
std::unique_ptr<LanguageSegmenter> lang_segmenter_;
std::unique_ptr<SchemaStore> schema_store_;
std::unique_ptr<DocumentStore> document_store_;
};
} // namespace
// Create() must reject a null clock as well as a null embedding index, in both
// cases with FAILED_PRECONDITION.
TEST_F(EmbeddingIndexingHandlerTest, CreationWithNullPointerShouldFail) {
  const auto expected_code =
      libtextclassifier3::StatusCode::FAILED_PRECONDITION;

  // Null clock, valid index.
  auto without_clock = EmbeddingIndexingHandler::Create(
      /*clock=*/nullptr, embedding_index_.get());
  EXPECT_THAT(without_clock, StatusIs(expected_code));

  // Valid clock, null index.
  auto without_index = EmbeddingIndexingHandler::Create(
      &fake_clock_, /*embedding_index=*/nullptr);
  EXPECT_THAT(without_index, StatusIs(expected_code));
}
// Handling a flat FakeType document adds one hit per vector in the
// bodyEmbedding and titleEmbedding sections and stores their raw data; the
// nonIndexableEmbedding vector does not show up in the index.
TEST_F(EmbeddingIndexingHandlerTest, HandleEmbeddingSection) {
  DocumentProto doc_proto =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPropertyTitle), "title")
          .AddVectorProperty(std::string(kPropertyTitleEmbedding),
                             CreateVector("model", {0.1, 0.2, 0.3}))
          .AddStringProperty(std::string(kPropertyBody), "body")
          .AddVectorProperty(std::string(kPropertyBodyEmbedding),
                             CreateVector("model", {0.4, 0.5, 0.6}),
                             CreateVector("model", {0.7, 0.8, 0.9}))
          .AddVectorProperty(std::string(kPropertyNonIndexableEmbedding),
                             CreateVector("model", {1.1, 1.2, 1.3}))
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                std::move(doc_proto)));
  ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id,
                             document_store_->Put(tokenized.document()));

  // Precondition: nothing has been added to the index yet.
  ASSERT_THAT(embedding_index_->last_added_document_id(),
              Eq(kInvalidDocumentId));

  // Run the handler on the document.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<EmbeddingIndexingHandler> handler,
      EmbeddingIndexingHandler::Create(&fake_clock_, embedding_index_.get()));
  const auto handle_status =
      handler->Handle(tokenized, doc_id, /*recovery_mode=*/false,
                      /*put_document_stats=*/nullptr);
  EXPECT_THAT(handle_status, IsOk());

  // Verify the hits: two body vectors (locations 0 and 3) followed by the
  // title vector (location 6).
  const BasicHit body_hit(kSectionIdBodyEmbedding, /*document_id=*/0);
  const BasicHit title_hit(kSectionIdTitleEmbedding, /*document_id=*/0);
  EXPECT_THAT(GetHits(/*dimension=*/3, /*model_signature=*/"model"),
              IsOkAndHolds(ElementsAre(
                  EmbeddingHit(body_hit, /*location=*/0),
                  EmbeddingHit(body_hit, /*location=*/3),
                  EmbeddingHit(title_hit, /*location=*/6))));

  // Verify the raw vector data and the bookkeeping of the last added document.
  EXPECT_THAT(GetRawEmbeddingData(),
              ElementsAre(0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.1, 0.2, 0.3));
  EXPECT_THAT(embedding_index_->last_added_document_id(), Eq(doc_id));
}
// Handling a FakeCollectionType document indexes the embedding sections of the
// nested FakeType document as well as the top-level fullDocEmbedding section.
TEST_F(EmbeddingIndexingHandlerTest, HandleNestedEmbeddingSection) {
  // Nested FakeType document stored under the "collection" property.
  DocumentProto nested_proto =
      DocumentBuilder()
          .SetKey("icing", "nested_fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPropertyTitle), "title")
          .AddVectorProperty(std::string(kPropertyTitleEmbedding),
                             CreateVector("model", {0.1, 0.2, 0.3}))
          .AddStringProperty(std::string(kPropertyBody), "body")
          .AddVectorProperty(std::string(kPropertyBodyEmbedding),
                             CreateVector("model", {0.4, 0.5, 0.6}),
                             CreateVector("model", {0.7, 0.8, 0.9}))
          .AddVectorProperty(std::string(kPropertyNonIndexableEmbedding),
                             CreateVector("model", {1.1, 1.2, 1.3}))
          .Build();
  // Outer FakeCollectionType document.
  DocumentProto outer_proto =
      DocumentBuilder()
          .SetKey("icing", "fake_collection_type/1")
          .SetSchema(std::string(kFakeCollectionType))
          .AddDocumentProperty(std::string(kPropertyCollection),
                               std::move(nested_proto))
          .AddVectorProperty(std::string(kPropertyFullDocEmbedding),
                             CreateVector("model", {2.1, 2.2, 2.3}))
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                std::move(outer_proto)));
  ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id,
                             document_store_->Put(tokenized.document()));

  // Precondition: nothing has been added to the index yet.
  ASSERT_THAT(embedding_index_->last_added_document_id(),
              Eq(kInvalidDocumentId));

  // Run the handler on the document.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<EmbeddingIndexingHandler> handler,
      EmbeddingIndexingHandler::Create(&fake_clock_, embedding_index_.get()));
  const auto handle_status =
      handler->Handle(tokenized, doc_id, /*recovery_mode=*/false,
                      /*put_document_stats=*/nullptr);
  EXPECT_THAT(handle_status, IsOk());

  // Verify the hits: nested body vectors, nested title vector, then the
  // full-document vector.
  const BasicHit nested_body_hit(kSectionIdNestedBodyEmbedding,
                                 /*document_id=*/0);
  const BasicHit nested_title_hit(kSectionIdNestedTitleEmbedding,
                                  /*document_id=*/0);
  const BasicHit full_doc_hit(kSectionIdFullDocEmbedding, /*document_id=*/0);
  EXPECT_THAT(GetHits(/*dimension=*/3, /*model_signature=*/"model"),
              IsOkAndHolds(ElementsAre(
                  EmbeddingHit(nested_body_hit, /*location=*/0),
                  EmbeddingHit(nested_body_hit, /*location=*/3),
                  EmbeddingHit(nested_title_hit, /*location=*/6),
                  EmbeddingHit(full_doc_hit, /*location=*/9))));

  // Verify the raw vector data and the bookkeeping of the last added document.
  EXPECT_THAT(GetRawEmbeddingData(), ElementsAre(0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
                                                 0.1, 0.2, 0.3, 2.1, 2.2, 2.3));
  EXPECT_THAT(embedding_index_->last_added_document_id(), Eq(doc_id));
}
// Passing kInvalidDocumentId to Handle() must fail with INVALID_ARGUMENT and
// leave the index untouched, both in normal and in recovery mode.
TEST_F(EmbeddingIndexingHandlerTest,
       HandleInvalidDocumentIdShouldReturnInvalidArgumentError) {
  DocumentProto doc_proto =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPropertyTitle), "title")
          .AddVectorProperty(std::string(kPropertyTitleEmbedding),
                             CreateVector("model", {0.1, 0.2, 0.3}))
          .AddStringProperty(std::string(kPropertyBody), "body")
          .AddVectorProperty(std::string(kPropertyBodyEmbedding),
                             CreateVector("model", {0.4, 0.5, 0.6}),
                             CreateVector("model", {0.7, 0.8, 0.9}))
          .AddVectorProperty(std::string(kPropertyNonIndexableEmbedding),
                             CreateVector("model", {1.1, 1.2, 1.3}))
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                std::move(doc_proto)));
  ICING_ASSERT_OK(document_store_->Put(tokenized.document()));

  // Pretend the index has already processed documents up to id 3.
  static constexpr DocumentId kCurrentDocumentId = 3;
  embedding_index_->set_last_added_document_id(kCurrentDocumentId);
  ASSERT_THAT(embedding_index_->last_added_document_id(),
              Eq(kCurrentDocumentId));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<EmbeddingIndexingHandler> handler,
      EmbeddingIndexingHandler::Create(&fake_clock_, embedding_index_.get()));

  // Handling document with kInvalidDocumentId should cause a failure, and both
  // index data and last_added_document_id should remain unchanged. Recovery
  // mode should get the same result, so the same checks run for both modes.
  for (const bool recovery_mode : {false, true}) {
    SCOPED_TRACE(recovery_mode ? "recovery_mode=true" : "recovery_mode=false");

    EXPECT_THAT(handler->Handle(tokenized, kInvalidDocumentId, recovery_mode,
                                /*put_document_stats=*/nullptr),
                StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
    EXPECT_THAT(embedding_index_->last_added_document_id(),
                Eq(kCurrentDocumentId));

    // Check that the embedding index should be empty
    EXPECT_THAT(GetHits(/*dimension=*/3, /*model_signature=*/"model"),
                IsOkAndHolds(IsEmpty()));
    EXPECT_THAT(GetRawEmbeddingData(), IsEmpty());
  }
}
// In normal (non-recovery) mode, Handle() must fail with INVALID_ARGUMENT when
// the document id is not greater than the index's last_added_document_id, and
// must leave the index untouched.
TEST_F(EmbeddingIndexingHandlerTest,
       HandleOutOfOrderDocumentIdShouldReturnInvalidArgumentError) {
  DocumentProto doc_proto =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPropertyTitle), "title")
          .AddVectorProperty(std::string(kPropertyTitleEmbedding),
                             CreateVector("model", {0.1, 0.2, 0.3}))
          .AddStringProperty(std::string(kPropertyBody), "body")
          .AddVectorProperty(std::string(kPropertyBodyEmbedding),
                             CreateVector("model", {0.4, 0.5, 0.6}),
                             CreateVector("model", {0.7, 0.8, 0.9}))
          .AddVectorProperty(std::string(kPropertyNonIndexableEmbedding),
                             CreateVector("model", {1.1, 1.2, 1.3}))
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                std::move(doc_proto)));
  ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id,
                             document_store_->Put(tokenized.document()));
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<EmbeddingIndexingHandler> handler,
      EmbeddingIndexingHandler::Create(&fake_clock_, embedding_index_.get()));

  // Two out-of-order situations, checked in turn:
  //   1. document_id == last_added_document_id
  //   2. document_id <  last_added_document_id
  // Either one should cause a failure, and both index data and
  // last_added_document_id should remain unchanged.
  const DocumentId last_added_ids[] = {doc_id,
                                       static_cast<DocumentId>(doc_id + 1)};
  for (const DocumentId last_added_id : last_added_ids) {
    SCOPED_TRACE("last_added_document_id = " + std::to_string(last_added_id));

    embedding_index_->set_last_added_document_id(last_added_id);
    ASSERT_THAT(embedding_index_->last_added_document_id(), Eq(last_added_id));

    EXPECT_THAT(handler->Handle(tokenized, doc_id, /*recovery_mode=*/false,
                                /*put_document_stats=*/nullptr),
                StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
    EXPECT_THAT(embedding_index_->last_added_document_id(), Eq(last_added_id));

    // Check that the embedding index should be empty
    EXPECT_THAT(GetHits(/*dimension=*/3, /*model_signature=*/"model"),
                IsOkAndHolds(IsEmpty()));
    EXPECT_THAT(GetRawEmbeddingData(), IsEmpty());
  }
}
// In recovery mode, Handle() indexes a document whose id is greater than
// last_added_document_id, and silently skips (returns OK without touching the
// index) a document whose id is less than or equal to it.
TEST_F(EmbeddingIndexingHandlerTest,
       HandleRecoveryModeShouldIgnoreDocsLELastAddedDocId) {
  DocumentProto first_proto =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPropertyTitle), "title one")
          .AddVectorProperty(std::string(kPropertyTitleEmbedding),
                             CreateVector("model", {0.1, 0.2, 0.3}))
          .AddStringProperty(std::string(kPropertyBody), "body one")
          .AddVectorProperty(std::string(kPropertyBodyEmbedding),
                             CreateVector("model", {0.4, 0.5, 0.6}),
                             CreateVector("model", {0.7, 0.8, 0.9}))
          .AddVectorProperty(std::string(kPropertyNonIndexableEmbedding),
                             CreateVector("model", {1.1, 1.2, 1.3}))
          .Build();
  DocumentProto second_proto =
      DocumentBuilder()
          .SetKey("icing", "fake_type/2")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPropertyTitle), "title two")
          .AddVectorProperty(std::string(kPropertyTitleEmbedding),
                             CreateVector("model", {10.1, 10.2, 10.3}))
          .AddStringProperty(std::string(kPropertyBody), "body two")
          .AddVectorProperty(std::string(kPropertyBodyEmbedding),
                             CreateVector("model", {10.4, 10.5, 10.6}),
                             CreateVector("model", {10.7, 10.8, 10.9}))
          .AddVectorProperty(std::string(kPropertyNonIndexableEmbedding),
                             CreateVector("model", {11.1, 11.2, 11.3}))
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument first_tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                std::move(first_proto)));
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument second_tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                std::move(second_proto)));
  ICING_ASSERT_OK_AND_ASSIGN(
      DocumentId first_id, document_store_->Put(first_tokenized.document()));
  ICING_ASSERT_OK_AND_ASSIGN(
      DocumentId second_id, document_store_->Put(second_tokenized.document()));
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<EmbeddingIndexingHandler> handler,
      EmbeddingIndexingHandler::Create(&fake_clock_, embedding_index_.get()));

  // Expectation shared by every step below: the index holds exactly the hits
  // and raw data of the first document and nothing from the second one.
  const auto expect_only_first_document_indexed = [this]() {
    const BasicHit body_hit(kSectionIdBodyEmbedding, /*document_id=*/0);
    const BasicHit title_hit(kSectionIdTitleEmbedding, /*document_id=*/0);
    EXPECT_THAT(GetHits(/*dimension=*/3, /*model_signature=*/"model"),
                IsOkAndHolds(ElementsAre(
                    EmbeddingHit(body_hit, /*location=*/0),
                    EmbeddingHit(body_hit, /*location=*/3),
                    EmbeddingHit(title_hit, /*location=*/6))));
    EXPECT_THAT(GetRawEmbeddingData(),
                ElementsAre(0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.1, 0.2, 0.3));
  };

  // Handle document with document_id > last_added_document_id in recovery mode.
  // The handler should index this document and update last_added_document_id.
  {
    SCOPED_TRACE("document_id > last_added_document_id");
    EXPECT_THAT(
        handler->Handle(first_tokenized, first_id, /*recovery_mode=*/true,
                        /*put_document_stats=*/nullptr),
        IsOk());
    EXPECT_THAT(embedding_index_->last_added_document_id(), Eq(first_id));
    expect_only_first_document_indexed();
  }

  // Handle the second document in recovery mode while last_added_document_id
  // is, in turn, equal to and greater than its document id. We should not get
  // any error, but the handler should ignore the document, so both index data
  // and last_added_document_id should remain unchanged.
  const DocumentId last_added_ids[] = {second_id,
                                       static_cast<DocumentId>(second_id + 1)};
  for (const DocumentId last_added_id : last_added_ids) {
    SCOPED_TRACE("last_added_document_id = " + std::to_string(last_added_id));

    embedding_index_->set_last_added_document_id(last_added_id);
    ASSERT_THAT(embedding_index_->last_added_document_id(), Eq(last_added_id));

    EXPECT_THAT(
        handler->Handle(second_tokenized, second_id, /*recovery_mode=*/true,
                        /*put_document_stats=*/nullptr),
        IsOk());
    EXPECT_THAT(embedding_index_->last_added_document_id(), Eq(last_added_id));
    expect_only_first_document_indexed();
  }
}
} // namespace lib
} // namespace icing