// blob: e10fe25d80083334082309fe132a54ec30515f03 [file] [log] [blame]
// Copyright (C) 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "icing/util/tokenized-document.h"
#include <memory>
#include <string_view>
#include <utility>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/proto/document.pb.h"
#include "icing/schema/joinable-property.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/token.h"
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
#include "icing/util/document-validator.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
namespace {
// Converts every string section into a TokenizedSection.
//
// For each input section, an indexing tokenizer is created from the section's
// configured tokenizer type and the supplied language segmenter. Each content
// string of the section is run through that tokenizer, and the text of every
// token it yields is appended, in order, to the section's token list.
//
// Note: schema_store is accepted but not referenced by this function body.
//
// Returns:
//   One TokenizedSection per input section, in input order, on success.
//   Otherwise the error from tokenizer creation or from Tokenizer::Tokenize(),
//   handed back through ICING_ASSIGN_OR_RETURN.
libtextclassifier3::StatusOr<std::vector<TokenizedSection>> Tokenize(
    const SchemaStore* schema_store,
    const LanguageSegmenter* language_segmenter,
    const std::vector<Section<std::string_view>>& string_sections) {
  std::vector<TokenizedSection> result;

  for (const Section<std::string_view>& string_section : string_sections) {
    // A fresh tokenizer is built per section since each section carries its
    // own tokenizer setting in its metadata.
    ICING_ASSIGN_OR_RETURN(
        std::unique_ptr<Tokenizer> section_tokenizer,
        tokenizer_factory::CreateIndexingTokenizer(
            string_section.metadata.tokenizer, language_segmenter));

    // Token texts from all content strings of this section, concatenated.
    std::vector<std::string_view> section_tokens;
    for (std::string_view text : string_section.content) {
      ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> token_itr,
                             section_tokenizer->Tokenize(text));
      // Each successful Advance() exposes a batch of tokens via GetTokens().
      while (token_itr->Advance()) {
        for (const Token& token : token_itr->GetTokens()) {
          section_tokens.push_back(token.text);
        }
      }
    }

    // The metadata is copied; the collected tokens are moved into the result.
    result.emplace_back(SectionMetadata(string_section.metadata),
                        std::move(section_tokens));
  }

  return result;
}
} // namespace
// Builds a TokenizedDocument from `document`.
//
// Steps, in order:
//   1. Validate the document with a DocumentValidator built on schema_store.
//   2. Extract the document's sections and joinable properties through
//      schema_store.
//   3. Tokenize the extracted string sections.
//   4. Construct the TokenizedDocument from the document, the tokenized string
//      sections, the integer and vector sections, and the joinable properties.
//
// Returns:
//   The constructed TokenizedDocument on success.
//   Otherwise the first error reported by validation, extraction or
//   tokenization (handed back through the ICING_* status macros).
/* static */ libtextclassifier3::StatusOr<TokenizedDocument>
TokenizedDocument::Create(const SchemaStore* schema_store,
                          const LanguageSegmenter* language_segmenter,
                          DocumentProto document) {
  // Validation comes first: nothing is extracted from a rejected document.
  DocumentValidator document_validator(schema_store);
  ICING_RETURN_IF_ERROR(document_validator.Validate(document));

  ICING_ASSIGN_OR_RETURN(SectionGroup sections,
                         schema_store->ExtractSections(document));
  ICING_ASSIGN_OR_RETURN(JoinablePropertyGroup joinable_properties,
                         schema_store->ExtractJoinableProperties(document));

  // Only the string sections go through tokenization; the integer and vector
  // sections are forwarded below as extracted.
  ICING_ASSIGN_OR_RETURN(
      std::vector<TokenizedSection> tokenized_sections,
      Tokenize(schema_store, language_segmenter, sections.string_sections));

  return TokenizedDocument(std::move(document),
                           std::move(tokenized_sections),
                           std::move(sections.integer_sections),
                           std::move(sections.vector_sections),
                           std::move(joinable_properties));
}
} // namespace lib
} // namespace icing