blob: 996a883c48a0a8819be054630710b8eec8b45d76 [file] [log] [blame]
// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "net/tools/tld_cleanup/tld_cleanup_util.h"
#include <sstream>
#include <string>
#include "base/containers/contains.h"
#include "base/files/file_util.h"
#include "base/logging.h"
#include "base/ranges/algorithm.h"
#include "base/strings/strcat.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "url/gurl.h"
#include "url/third_party/mozilla/url_parse.h"
namespace {

// Comment lines that open and close the private-domains section of the input
// data. Lines are matched against these by prefix.
constexpr char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
constexpr char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

// Values written to the "type" field of each generated gperf entry. The
// private flag is added on top of either the exception or the wildcard value.
constexpr int kExceptionRule = 1;
constexpr int kWildcardRule = 2;
constexpr int kPrivateRule = 4;

}  // namespace
namespace net::tld_cleanup {
// Serializes |rules| as gperf input text.
//
// The output consists of a fixed preamble (license header and the DomainRule
// struct declaration), one "<domain>, <type>" line per rule, and a closing
// "%%" line. <type> is kExceptionRule for exception rules, otherwise
// kWildcardRule for wildcard rules, otherwise 0; kPrivateRule is added when
// the rule is private.
std::string RulesToGperf(const RuleMap& rules) {
  std::string gperf_input(
      "%{\n"
      "// Copyright 2012 The Chromium Authors\n"
      "// Use of this source code is governed by a BSD-style license "
      "that can be\n"
      "// found in the LICENSE file.\n\n"
      "// This file is generated by net/tools/tld_cleanup/.\n"
      "// DO NOT MANUALLY EDIT!\n"
      "%}\n"
      "struct DomainRule {\n"
      " int name_offset;\n"
      " int type; // flags: 1: exception, 2: wildcard, 4: private\n"
      "};\n"
      "%%\n");

  for (const auto& entry : rules) {
    const auto& flags = entry.second;

    // An exception takes precedence over a wildcard; the two are never
    // combined in the emitted value.
    int type_bits = 0;
    if (flags.exception) {
      type_bits = kExceptionRule;
    } else if (flags.wildcard) {
      type_bits = kWildcardRule;
    }
    if (flags.is_private) {
      type_bits |= kPrivateRule;
    }

    gperf_input += entry.first;
    gperf_input += ", ";
    gperf_input += base::NumberToString(type_bits);
    gperf_input += "\n";
  }

  gperf_input += "%%\n";
  return gperf_input;
}
// Brings |domain| into a standard form, in place, and records any leading
// rule marker in |rule|:
//   * at most one leading and one trailing '.' are dropped;
//   * a leading '!' sets |rule.exception|, otherwise a leading "*." sets
//     |rule.wildcard|; the marker is removed before canonicalization;
//   * the remainder is canonicalized by parsing it as the host of an
//     "http://" GURL and reading the host back out of the resulting spec.
//
// Returns kSuccess if the rule is interpreted as valid. Logs a warning and
// returns kWarning if the rule is empty (in which case |domain| is left
// empty), still contains "*." or '!', or yields a GURL that is not valid.
// Logs an error and returns kError if no host could be extracted.
NormalizeResult NormalizeRule(std::string& domain, Rule& rule) {
  // Drop one extraneous dot from each end, if present.
  if (!domain.empty() && domain.front() == '.') {
    domain.erase(domain.begin());
  }
  if (!domain.empty() && domain.back() == '.') {
    domain.erase(domain.size() - 1);
  }

  // Peel off a single leading marker so that it is not fed to GURL.
  if (!domain.empty() && domain.front() == '!') {
    rule.exception = true;
    domain.erase(0, 1);
  } else if (domain.compare(0, 2, "*.") == 0) {
    rule.wildcard = true;
    domain.erase(0, 2);
  }

  if (domain.empty()) {
    LOG(WARNING) << "Ignoring empty rule";
    return NormalizeResult::kWarning;
  }

  NormalizeResult outcome = NormalizeResult::kSuccess;

  // Any marker still present after the first one was removed is suspicious.
  const bool has_stray_wildcard = domain.find("*.") != std::string::npos;
  const bool has_stray_exception = domain.find('!') != std::string::npos;
  if (has_stray_wildcard || has_stray_exception) {
    LOG(WARNING) << "Keeping probably invalid rule: " << domain;
    outcome = NormalizeResult::kWarning;
  }

  // Round-trip the rule through GURL to canonicalize it.
  const GURL canonical_url(base::StrCat({"http://", domain}));
  const url::Component host_part =
      canonical_url.parsed_for_possibly_invalid_spec().host;
  if (!host_part.is_valid()) {
    LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << domain;
    return NormalizeResult::kError;
  }
  if (!canonical_url.is_valid()) {
    LOG(WARNING) << "Keeping rule that GURL says is invalid: " << domain;
    outcome = NormalizeResult::kWarning;
  }

  domain =
      canonical_url.possibly_invalid_spec().substr(host_part.begin,
                                                   host_part.len);
  return outcome;
}
// Parses |data|, one rule per line, and adds the normalized rules to |rules|.
//
// Lines beginning with "//" are comments; the BEGIN/END PRIVATE DOMAINS
// marker comments toggle whether subsequent rules are recorded as private.
// Empty lines are skipped and every other line is truncated at its first
// whitespace character before being passed to NormalizeRule().
//
// For each rule that contains a '.', the part after the last '.' is also
// added as a plain rule of its own, unless |rules| already holds an entry for
// it once all lines have been processed.
//
// Returns the maximum of the per-line NormalizeRule() results (presumably the
// most severe one -- this relies on the ordering of the NormalizeResult
// enumerators, which is declared elsewhere). CHECK-fails if the same
// normalized domain is produced by two different lines.
NormalizeResult NormalizeDataToRuleMap(const std::string& data,
                                       RuleMap& rules) {
  // We do a lot of string assignment during parsing, but simplicity is more
  // important than performance here.
  NormalizeResult result = NormalizeResult::kSuccess;
  std::istringstream data_stream(data);
  bool in_private_section = false;
  RuleMap extra_rules;

  for (std::string line; std::getline(data_stream, line, '\n');) {
    // The section markers are themselves "//" comments, so they have to be
    // recognized before the generic comment check below.
    if (line.starts_with(kBeginPrivateDomainsComment)) {
      in_private_section = true;
      continue;
    }
    if (line.starts_with(kEndPrivateDomainsComment)) {
      in_private_section = false;
      continue;
    }
    if (line.starts_with("//")) {
      // Skip comments.
      continue;
    }
    if (line.empty()) {
      continue;
    }

    // Truncate at first whitespace.
    if (size_t first_whitespace = line.find_first_of("\r\n \t");
        first_whitespace != std::string::npos) {
      line.erase(first_whitespace);
    }

    std::string domain = line;
    Rule rule{/*exception=*/false, /*wildcard=*/false,
              /*is_private=*/in_private_section};
    const NormalizeResult new_result = NormalizeRule(domain, rule);
    result = std::max(result, new_result);
    if (new_result == NormalizeResult::kError) {
      continue;
    }
    // NormalizeRule() reports an empty rule (e.g. a line consisting only of
    // ".", "!" or "*.", or one that starts with whitespace) with a warning
    // that says the rule is being ignored, so actually ignore it here.
    // Otherwise it would be stored under an empty domain name, and a second
    // such line would trip the duplicate check below.
    if (domain.empty()) {
      continue;
    }

    // Check the existing rules to make sure we don't have an exception and
    // wildcard for the same rule, or that the same domain is listed as both
    // private and not private. If we did, we'd have to update our
    // parsing code to handle this case.
    CHECK(!base::Contains(rules, domain))
        << "Duplicate rule found for " << domain;
    rules[domain] = rule;

    // Add true TLD for multi-level rules. We don't add them right now, in
    // case there's an exception or wild card that either exists or might be
    // added in a later iteration. In those cases, there's no need to add
    // it and it would just slow down parsing the data.
    size_t tld_start = domain.find_last_of('.');
    if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
      std::string extra_rule_domain = domain.substr(tld_start + 1);
      RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
      // If a rule already exists, we ensure that if any of the entries is not
      // private the result should be that the entry is not private. An example
      // is .au which is not listed as a real TLD, but only lists second-level
      // domains such as com.au. Subdomains of .au (eg. blogspot.com.au) are
      // also listed in the private section, which is processed later, so this
      // ensures that the real TLD (eg. .au) is listed as public.
      bool is_private = in_private_section &&
                        (iter == extra_rules.end() || iter->second.is_private);
      extra_rules[extra_rule_domain] =
          Rule{/*exception=*/false, /*wildcard=*/false, is_private};
    }
  }

  // Only add the implied TLD rules that were not listed explicitly.
  base::ranges::copy_if(extra_rules, std::inserter(rules, rules.end()),
                        [&](const auto& extra_rule) {
                          return !base::Contains(rules, extra_rule.first);
                        });
  return result;
}
// Reads the rule list from |in_filename|, normalizes it, and writes the
// resulting gperf input to |out_filename|.
//
// Returns the result of normalizing the data, or kError if the output file
// could not be written. A failure to read the input file is logged but
// reported as kSuccess.
NormalizeResult NormalizeFile(const base::FilePath& in_filename,
                              const base::FilePath& out_filename) {
  std::string contents;
  if (!base::ReadFileToString(in_filename, &contents)) {
    LOG(ERROR) << "Unable to read file";
    // We return success since we've already reported the error.
    return NormalizeResult::kSuccess;
  }

  RuleMap parsed_rules;
  const NormalizeResult parse_result =
      NormalizeDataToRuleMap(contents, parsed_rules);

  const bool wrote_output =
      base::WriteFile(out_filename, RulesToGperf(parsed_rules));
  if (wrote_output) {
    return parse_result;
  }

  LOG(ERROR) << "Error(s) writing output file";
  return NormalizeResult::kError;
}
} // namespace net::tld_cleanup