// blob: ebfb2f8f47e899a748b07cfce0dcc21c2a9d3fd1 [file] [log] [blame]
// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package org.chromium.base;
import android.text.TextUtils;
import android.util.Patterns;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Provides public methods for detecting and eliding sensitive PII.
 *
 * <p>All methods are static and operate on plain strings. Each {@code elide*} method replaces
 * every match of one pattern (e-mail address, URL, IP address, MAC address, console message)
 * with a fixed placeholder string and returns the result.
 */
public class PiiElider {
    private static final String EMAIL_ELISION = "XXX@EMAIL.ELIDED";
    private static final String URL_ELISION = "HTTP://WEBADDRESS.ELIDED";

    // The constants below are regex fragments that are concatenated into WEB_URL.

    // Character-class contents accepted inside a host-name label: ASCII letters and digits plus
    // several non-ASCII code point ranges.
    private static final String GOOD_IRI_CHAR = "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF";

    // Four dot-separated decimal groups, each limited to at most 255.
    private static final String IP_ADDRESS =
            "((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4]"
                    + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]"
                    + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
                    + "|[1-9][0-9]|[0-9]))";

    // A single host-name label: 1 to 63 characters, where '-' is only allowed in the interior.
    private static final String IRI =
            "[" + GOOD_IRI_CHAR + "]([" + GOOD_IRI_CHAR + "-]{0,61}[" + GOOD_IRI_CHAR + "]){0,1}";

    // Same ranges as GOOD_IRI_CHAR but without the ASCII digits.
    private static final String GOOD_GTLD_CHAR = "a-zA-Z\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF";

    // Final (top-level) label of a host name: 2 to 63 characters.
    private static final String GTLD = "[" + GOOD_GTLD_CHAR + "]{2,63}";

    // One or more dot-terminated labels followed by the top-level label.
    private static final String HOST_NAME = "(" + IRI + "\\.)+" + GTLD;

    // A percent sign followed by exactly two hex digits.
    private static final String URI_ENCODED_CHAR = "(%[a-fA-F0-9]{2})";

    // Character allowed in the user-info part (user and password) that may follow the scheme.
    private static final String URI_CHAR = "([a-zA-Z0-9$_.+!*'(),;?&=-]|" + URI_ENCODED_CHAR + ")";

    private static final String PATH_CHAR =
            // Either a single valid path component character or a URI-encoded character.
            "(([" + GOOD_IRI_CHAR + ";/?:@&=#~.+!*'(),_-])|" + URI_ENCODED_CHAR + ")";

    // http, https or rtsp scheme, optionally followed by "user@" or "user:password@".
    private static final String URI_SCHEME =
            "((http|https|Http|Https|rtsp|Rtsp)://"
                    + "("
                    + URI_CHAR
                    + "{1,64}(:"
                    + URI_CHAR
                    + "{1,25})?@)?)";

    private static final String DOMAIN_NAME = "(" + HOST_NAME + "|" + IP_ADDRESS + ")";

    // A colon followed by 1 to 5 digits.
    private static final String PORT = "(:\\d{1,5})";

    private static final String URL_WITH_OPTIONAL_SCHEME_AND_PORT =
            "(" + URI_SCHEME + "?" + DOMAIN_NAME + PORT + "?)";

    private static final String PATH_COMPONENT = "(" + PATH_CHAR + "+)";

    // Based on: http://www.faqs.org/rfcs/rfc2396.html#:~:text=Scheme%20Component
    private static final String INTENT_SCHEME = "[a-zA-Z][a-zA-Z0-9+.-]+://";

    // Any "scheme://" prefix followed by at least one path character.
    private static final String INTENT = "(" + INTENT_SCHEME + PATH_COMPONENT + ")";

    private static final String URL_OR_INTENT =
            "(" + URL_WITH_OPTIONAL_SCHEME_AND_PORT + "|" + INTENT + ")";

    private static final Pattern WEB_URL =
            Pattern.compile(
                    "(\\b|^)" // Always start on a word boundary or start of string.
                            + "("
                            + URL_OR_INTENT
                            + ")" // Main URL or Intent scheme/domain/root path.
                            + "(/"
                            + PATH_CHAR
                            + "*)?" // Rest of the URI path.
                            + "(\\b|$)"); // Always end on a word boundary or end of string.

    // Example variant info chromium-TrichromeChromeGoogle6432.aab
    private static final String CHROME_VARIANT_INFO = "chromium-[^\\.]+\\.aab";

    // Matches any one of three hints that a line belongs to an exception log; see the comments on
    // the individual alternatives below.
    private static final Pattern LIKELY_EXCEPTION_LOG =
            Pattern.compile(
                    "\\sat\\s"
                            // These are all package prefixes of classes that are likely to
                            // exist on a stacktrace and are very unlikely to be a PII url.
                            + "(org\\.chromium|com\\.google|java|android|com\\.android)\\.[^ ]+.|"
                            // if a line has what looks like line number info, it's probably an
                            // exception log.
                            + "\\("
                            + CHROME_VARIANT_INFO
                            + "[^:]+:\\d+\\)|"
                            // When a class is not found it can fail to satisfy our isClass
                            // check but is still worth noting what it was.
                            + "Caused by: java\\.lang\\."
                            + "(ClassNotFoundException|NoClassDefFoundError):");

    private static final String IP_ELISION = "1.2.3.4";
    private static final String MAC_ELISION = "01:23:45:67:89:AB";
    private static final String CONSOLE_ELISION = "[ELIDED:CONSOLE(0)] ELIDED CONSOLE MESSAGE";

    // Six pairs of hex digits; consecutive pairs are separated by one or more '-' or ':'.
    private static final Pattern MAC_ADDRESS =
            Pattern.compile("([0-9a-fA-F]{2}[-:]+){5}[0-9a-fA-F]{2}");

    // A bracketed "[<word chars>:CONSOLE...]" tag and everything after it on the same line.
    private static final Pattern CONSOLE_MSG = Pattern.compile("\\[\\w*:CONSOLE.*\\].*");

    // URL candidates starting with one of these prefixes are never elided by elideUrl().
    private static final String[] APP_NAMESPACE =
            new String[] {"org.chromium.", "com.google.", "com.chrome."};

    // URL candidates starting with one of these prefixes are never elided by elideUrl().
    private static final String[] SYSTEM_NAMESPACE =
            new String[] {
                "android.",
                "com.android.",
                "dalvik.",
                "java.",
                "javax.",
                "org.apache.",
                "org.json.",
                "org.w3c.dom.",
                "org.xml.",
                "org.xmlpull.",
                "System."
            };

    /**
     * Elides any emails in the specified {@link String} with
     * {@link #EMAIL_ELISION}.
     *
     * @param original String potentially containing emails.
     * @return String with elided emails.
     */
    public static String elideEmail(String original) {
        return Patterns.EMAIL_ADDRESS.matcher(original).replaceAll(EMAIL_ELISION);
    }

    /**
     * Elides any URLs in the specified {@link String} with
     * {@link #URL_ELISION}.
     *
     * <p>The input is returned untouched when it looks like part of an exception log. Otherwise
     * each URL candidate is replaced unless it starts with a known app or system package prefix,
     * or it (or the part before its last period) can be loaded as a class.
     *
     * @param original String potentially containing URLs.
     * @return String with elided URLs.
     */
    public static String elideUrl(String original) {
        // Url-matching is fussy. If something looks like an exception message, just return.
        if (LIKELY_EXCEPTION_LOG.matcher(original).find()) return original;

        StringBuilder buffer = new StringBuilder(original);
        Matcher matcher = WEB_URL.matcher(buffer);
        int start = 0;
        while (matcher.find(start)) {
            start = matcher.start();
            int end = matcher.end();
            String url = buffer.substring(start, end);
            if (!likelyToBeAppNamespace(url)
                    && !likelyToBeSystemNamespace(url)
                    && !likelyToBeClassOrMethodName(url)) {
                buffer.replace(start, end, URL_ELISION);
                // The buffer was modified, so resume right after the placeholder and use a new
                // matcher for the updated contents.
                end = start + URL_ELISION.length();
                matcher = WEB_URL.matcher(buffer);
            }
            start = end;
        }
        return buffer.toString();
    }

    /**
     * Returns whether the suspected URL is a loadable class name, or a loadable class name
     * followed by one more period-separated segment (e.g. a method name).
     */
    private static boolean likelyToBeClassOrMethodName(String url) {
        if (isClassName(url)) return true;

        // Since the suspected URL could actually be a method name, check if the portion preceding
        // the last subdomain is a class name.
        int indexOfLastPeriod = url.lastIndexOf(".");
        if (indexOfLastPeriod == -1) return false;
        return isClassName(url.substring(0, indexOfLastPeriod));
    }

    /**
     * Returns whether {@code url} can be loaded (without being initialized) as a class through the
     * application context's class loader. Any failure is treated as "not a class".
     */
    private static boolean isClassName(String url) {
        try {
            Class.forName(url, false, ContextUtils.getApplicationContext().getClassLoader());
            return true;
        } catch (Throwable ignored) {
            // Deliberately ignored: any failure simply means this is not a usable class name.
            // Some examples: ClassNotFoundException, NoClassDefFoundException, VerifyError.
        }
        return false;
    }

    /** Returns whether {@code url} starts with one of the {@link #APP_NAMESPACE} prefixes. */
    private static boolean likelyToBeAppNamespace(String url) {
        return startsWithAnyPrefix(url, APP_NAMESPACE);
    }

    /** Returns whether {@code url} starts with one of the {@link #SYSTEM_NAMESPACE} prefixes. */
    private static boolean likelyToBeSystemNamespace(String url) {
        return startsWithAnyPrefix(url, SYSTEM_NAMESPACE);
    }

    /** Returns whether {@code candidate} starts with at least one entry of {@code prefixes}. */
    private static boolean startsWithAnyPrefix(String candidate, String[] prefixes) {
        for (String prefix : prefixes) {
            if (candidate.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Elides any IP addresses in the specified {@link String} with
     * {@link #IP_ELISION}.
     *
     * @param original String potentially containing IPs.
     * @return String with elided IPs.
     */
    public static String elideIp(String original) {
        return Patterns.IP_ADDRESS.matcher(original).replaceAll(IP_ELISION);
    }

    /**
     * Elides any MAC addresses in the specified {@link String} with
     * {@link #MAC_ELISION}.
     *
     * @param original String potentially containing MACs.
     * @return String with elided MACs.
     */
    public static String elideMac(String original) {
        return MAC_ADDRESS.matcher(original).replaceAll(MAC_ELISION);
    }

    /**
     * Elides any console messages in the specified {@link String} with
     * {@link #CONSOLE_ELISION}.
     *
     * @param original String potentially containing console messages.
     * @return String with elided console messages.
     */
    public static String elideConsole(String original) {
        return CONSOLE_MSG.matcher(original).replaceAll(CONSOLE_ELISION);
    }

    /**
     * Elides any URL in the exception messages contained inside a stacktrace with
     * {@link #URL_ELISION}.
     *
     * <p>Only the first line and lines starting with "Caused by:" are passed through
     * {@link #elideUrl(String)}; all other lines are kept as they are. Trailing newlines of the
     * input are not preserved in the result.
     *
     * @param stacktrace Multiline stacktrace as a string.
     * @return Stacktrace with elided URLs, or an empty string if the input is null, empty, or
     *     consists only of newlines.
     */
    public static String sanitizeStacktrace(String stacktrace) {
        if (TextUtils.isEmpty(stacktrace)) {
            return "";
        }
        String[] frames = stacktrace.split("\\n");
        // String#split discards trailing empty strings, so an input made up solely of newlines
        // produces an empty array. Bail out instead of indexing frames[0].
        if (frames.length == 0) {
            return "";
        }
        // Sanitize first stacktrace line which contains the exception message.
        frames[0] = elideUrl(frames[0]);
        for (int i = 1; i < frames.length; i++) {
            // Nested exceptions should also have their message sanitized.
            if (frames[i].startsWith("Caused by:")) {
                frames[i] = elideUrl(frames[i]);
            }
        }
        return TextUtils.join("\n", frames);
    }
}