blob: 17149f54fd9e9ff01da129be04c4918879829302 [file] [log] [blame]
// Copyright (C) 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_
#define ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_
#include <cstdint>
#include <memory>
#include <set>
#include <stack>
#include <string>
#include <string_view>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/index/embed/embedding-index.h"
#include "icing/index/embed/embedding-query-results.h"
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator-filter.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/index/numeric/numeric-index.h"
#include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
#include "icing/query/advanced_query_parser/function.h"
#include "icing/query/advanced_query_parser/pending-value.h"
#include "icing/query/query-features.h"
#include "icing/query/query-results.h"
#include "icing/query/query-terms.h"
#include "icing/schema/schema-store.h"
#include "icing/store/document-store.h"
#include "icing/tokenization/tokenizer.h"
#include "icing/transform/normalizer.h"
#include <google/protobuf/repeated_field.h>
namespace icing {
namespace lib {
// The Visitor used to create the DocHitInfoIterator tree from the AST output by
// the parser.
class QueryVisitor : public AbstractSyntaxTreeVisitor {
public:
explicit QueryVisitor(
Index* index, const NumericIndex<int64_t>* numeric_index,
const EmbeddingIndex* embedding_index,
const DocumentStore* document_store, const SchemaStore* schema_store,
const Normalizer* normalizer, const Tokenizer* tokenizer,
std::string_view raw_query_text,
const google::protobuf::RepeatedPtrField<PropertyProto::VectorProto>*
embedding_query_vectors,
DocHitInfoIteratorFilter::Options filter_options,
TermMatchType::Code match_type,
SearchSpecProto::EmbeddingQueryMetricType::Code
embedding_query_metric_type,
bool needs_term_frequency_info, int64_t current_time_ms)
: QueryVisitor(index, numeric_index, embedding_index, document_store,
schema_store, normalizer, tokenizer, raw_query_text,
embedding_query_vectors, filter_options, match_type,
embedding_query_metric_type, needs_term_frequency_info,
PendingPropertyRestricts(),
/*processing_not=*/false, current_time_ms) {}
void VisitFunctionName(const FunctionNameNode* node) override;
void VisitString(const StringNode* node) override;
void VisitText(const TextNode* node) override;
void VisitMember(const MemberNode* node) override;
void VisitFunction(const FunctionNode* node) override;
void VisitUnaryOperator(const UnaryOperatorNode* node) override;
void VisitNaryOperator(const NaryOperatorNode* node) override;
// RETURNS:
// - the QueryResults reflecting the AST that was visited
// - INVALID_ARGUMENT if the AST does not conform to supported expressions
// - NOT_FOUND if the AST refers to a property that does not exist
libtextclassifier3::StatusOr<QueryResults> ConsumeResults() &&;
private:
// An internal class to help manage property restricts being applied at
// different levels.
class PendingPropertyRestricts {
public:
// Add another set of property restricts. Elements of new_restricts that are
// not present in active_property_rest
void AddValidRestricts(std::set<std::string> new_restricts);
// Pops the most recently added set of property restricts.
void PopRestricts() {
if (has_active_property_restricts()) {
pending_property_restricts_.pop_back();
}
}
bool has_active_property_restricts() const {
return !pending_property_restricts_.empty();
}
// The set of all property restrictions that are currently being applied.
const std::set<std::string>& active_property_restricts() const {
return pending_property_restricts_.back();
}
private:
std::vector<std::set<std::string>> pending_property_restricts_;
};
explicit QueryVisitor(
Index* index, const NumericIndex<int64_t>* numeric_index,
const EmbeddingIndex* embedding_index,
const DocumentStore* document_store, const SchemaStore* schema_store,
const Normalizer* normalizer, const Tokenizer* tokenizer,
std::string_view raw_query_text,
const google::protobuf::RepeatedPtrField<PropertyProto::VectorProto>*
embedding_query_vectors,
DocHitInfoIteratorFilter::Options filter_options,
TermMatchType::Code match_type,
SearchSpecProto::EmbeddingQueryMetricType::Code
embedding_query_metric_type,
bool needs_term_frequency_info,
PendingPropertyRestricts pending_property_restricts, bool processing_not,
int64_t current_time_ms)
: index_(*index),
numeric_index_(*numeric_index),
embedding_index_(*embedding_index),
document_store_(*document_store),
schema_store_(*schema_store),
normalizer_(*normalizer),
tokenizer_(*tokenizer),
raw_query_text_(raw_query_text),
embedding_query_vectors_(embedding_query_vectors),
filter_options_(std::move(filter_options)),
match_type_(match_type),
embedding_query_metric_type_(embedding_query_metric_type),
needs_term_frequency_info_(needs_term_frequency_info),
pending_property_restricts_(std::move(pending_property_restricts)),
processing_not_(processing_not),
expecting_numeric_arg_(false),
current_time_ms_(current_time_ms) {
RegisterFunctions();
}
bool has_pending_error() const { return !pending_error_.ok(); }
// Creates a DocHitInfoIterator reflecting the provided term and whether the
// prefix operator has been applied to this term. Also populates,
// property_query_terms_map_ and query_term_iterators_ as appropriate.
// Returns:
// - On success, a DocHitInfoIterator for the provided term
// - INVALID_ARGUMENT if unable to create an iterator for the term.
libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
CreateTermIterator(const QueryTerm& term);
// Processes the PendingValue at the top of pending_values_, parses it into a
// int64_t and pops the top.
// Returns:
// - On success, the int value stored in the text at the top
// - INVALID_ARGUMENT if pending_values_ is empty, doesn't hold a text or
// can't be parsed as an int.
libtextclassifier3::StatusOr<int64_t> PopPendingIntValue();
// Processes the PendingValue at the top of pending_values_ and pops the top.
// Returns:
// - On success, the string value stored in the text at the top and a bool
// indicating whether or not the string value has a prefix operator.
// - INVALID_ARGUMENT if pending_values_ is empty or doesn't hold a string.
libtextclassifier3::StatusOr<QueryTerm> PopPendingStringValue();
// Processes the PendingValue at the top of pending_values_ and pops the top.
// Returns:
// - On success, the string value stored in the text at the top
// indicating whether or not the string value has a prefix operator.
// - INVALID_ARGUMENT if pending_values_ is empty or doesn't hold a text.
libtextclassifier3::StatusOr<QueryTerm> PopPendingTextValue();
// Processes the PendingValue at the top of pending_values_ and pops the top.
// Returns:
// - On success, a DocHitInfoIterator representing for the term at the top
// - INVALID_ARGUMENT if pending_values_ is empty or if unable to create an
// iterator for the term.
libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
PopPendingIterator();
// Processes all PendingValues at the top of pending_values_ until the first
// placeholder is encounter.
// Returns:
// - On success, a vector containing all DocHitInfoIterators representing
// the values at the top of pending_values_
// - INVALID_ARGUMENT if pending_values_is empty or if unable to create an
// iterator for any of the terms at the top of pending_values_
libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DocHitInfoIterator>>>
PopAllPendingIterators();
// Processes the unary operator node as a NOT operator. A NOT can have an
// operator type of "NOT" or "MINUS"
//
// RETURNS:
// - OK on success
// - INVALID_ARGUMENT if any errors are encountered while processing
// node->child
libtextclassifier3::Status ProcessNotOperator(const UnaryOperatorNode* node);
// Processes the unary operator node as a negation operator. A negation
// operator should have an operator of type "MINUS" and it's children must
// resolve to a numeric value.
//
// RETURNS:
// - OK on success
// - INVALID_ARGUMENT if the node->child can't be resolved to a numeric
// value.
libtextclassifier3::Status ProcessNegationOperator(
const UnaryOperatorNode* node);
// Processes the NumericComparator represented by node. This must be called
// *after* this node's children have been visited. The PendingValues added by
// this node's children will be consumed by this function and the PendingValue
// for this node will be returned.
// Returns:
// - On success, OK
// - INVALID_ARGUMENT if unable to retrieve string value or int value
// - NOT_FOUND if there is no entry in the numeric index for the property
libtextclassifier3::Status ProcessNumericComparator(
const NaryOperatorNode* node);
// Processes the AND and OR operators represented by the node. This must be
// called *after* this node's children have been visited. The PendingValues
// added by this node's children will be consumed by this function and the
// PendingValue for this node will be returned.
// Returns:
// - On success, then PendingValue representing this node and it's children.
// - INVALID_ARGUMENT if unable to retrieve iterators for any of this node's
// children.
libtextclassifier3::StatusOr<PendingValue> ProcessAndOperator(
const NaryOperatorNode* node);
// Processes the OR operator represented by the node. This must be called
// *after* this node's children have been visited. The PendingValues added by
// this node's children will be consumed by this function and the PendingValue
// for this node will be returned.
// Returns:
// - On success, then PendingValue representing this node and it's children.
// - INVALID_ARGUMENT if unable to retrieve iterators for any of this node's
// children.
libtextclassifier3::StatusOr<PendingValue> ProcessOrOperator(
const NaryOperatorNode* node);
// Populates registered_functions with the currently supported set of
// functions.
void RegisterFunctions();
// Implementation of `search` custom function in the query language.
// Returns:
// - a PendingValue holding the DocHitInfoIterator reflecting the query
// provided to SearchFunction
// - any errors returned by Lexer::ExtractTokens, Parser::ConsumeQuery or
// QueryVisitor::ConsumeResults.
libtextclassifier3::StatusOr<PendingValue> SearchFunction(
std::vector<PendingValue>&& args);
// Implementation of the propertyDefined(property_path) custom function.
// Returns:
// - a Pending Value holding a DocHitIterator that returns hits for all
// documents whose schema types have defined the property specified by
// property_path.
// - any errors returned by Lexer::ExtractTokens
libtextclassifier3::StatusOr<PendingValue> PropertyDefinedFunction(
std::vector<PendingValue>&& args);
// Implementation of the hasProperty(property_path) custom function.
// Returns:
// - a Pending Value holding a DocHitIterator that returns hits for all
// documents that have the property specified by property_path.
// - any errors returned by Lexer::ExtractTokens
libtextclassifier3::StatusOr<PendingValue> HasPropertyFunction(
std::vector<PendingValue>&& args);
// Implementation of the semanticSearch(vector, low, high, metric) custom
// function. This function is used for supporting vector search with a
// syntax like `semanticSearch(getSearchSpecEmbedding(0), 0.5, 1, "COSINE")`.
//
// low, high, metric are optional parameters:
// - low is default to negative infinity
// - high is default to positive infinity
// - metric is default to the metric specified in SearchSpec
//
// Returns:
// - a Pending Value of type DocHitIterator that returns all documents with
// an embedding vector that has a score within [low, high].
// - any errors returned by Lexer::ExtractTokens
libtextclassifier3::StatusOr<PendingValue> SemanticSearchFunction(
std::vector<PendingValue>&& args);
// Handles a NaryOperatorNode where the operator is HAS (':') and pushes an
// iterator with the proper section filter applied. If the current property
// restriction represented by pending_property_restricts and the first child
// of this node is unsatisfiable (ex. `prop1:(prop2:foo)`), then a NONE
// iterator is returned immediately and subtree represented by the second
// child is not traversed.
//
// Returns:
// - OK on success
// - INVALID_ARGUMENT node does not have exactly two children or the two
// children cannot be resolved to a MEMBER or an iterator respectively.
libtextclassifier3::Status ProcessHasOperator(const NaryOperatorNode* node);
// Returns the correct match type to apply based on both the match type and
// whether the prefix operator is currently present.
TermMatchType::Code GetTermMatchType(bool is_prefix) const {
return (is_prefix) ? TermMatchType::PREFIX : match_type_;
}
std::stack<PendingValue> pending_values_;
libtextclassifier3::Status pending_error_;
// A map from function name to Function instance.
std::unordered_map<std::string, Function> registered_functions_;
SectionRestrictQueryTermsMap property_query_terms_map_;
QueryTermIteratorsMap query_term_iterators_;
EmbeddingQueryResults embedding_query_results_;
// Set of features invoked in the query.
std::unordered_set<Feature> features_;
Index& index_; // Does not own!
const NumericIndex<int64_t>& numeric_index_; // Does not own!
const EmbeddingIndex& embedding_index_; // Does not own!
const DocumentStore& document_store_; // Does not own!
const SchemaStore& schema_store_; // Does not own!
const Normalizer& normalizer_; // Does not own!
const Tokenizer& tokenizer_; // Does not own!
std::string_view raw_query_text_;
const google::protobuf::RepeatedPtrField<PropertyProto::VectorProto>*
embedding_query_vectors_; // Nullable, does not own!
DocHitInfoIteratorFilter::Options filter_options_;
TermMatchType::Code match_type_;
SearchSpecProto::EmbeddingQueryMetricType::Code embedding_query_metric_type_;
// Whether or not term_frequency information is needed. This affects:
// - how DocHitInfoIteratorTerms are constructed
// - whether the QueryTermIteratorsMap is populated in the QueryResults.
bool needs_term_frequency_info_;
// The stack of property restricts currently being processed by the visitor.
PendingPropertyRestricts pending_property_restricts_;
bool processing_not_;
// Whether we are in the midst of processing a subtree that is expected to
// resolve to a numeric argument.
bool expecting_numeric_arg_;
int64_t current_time_ms_;
};
} // namespace lib
} // namespace icing
#endif // ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_