Bug 414122 - Preprocess effective TLD data into C++ code (eliminate file I/O to read, move parsing out of C++, remove an arena, etc.). r=dwitte, sr=bsmedberg, a=blocker

git-svn-id: svn://10.0.0.236/trunk@245738 18797224-902f-48f8-a5cc-f745e15eee43
This commit is contained in:
jwalden%mit.edu 2008-02-14 22:57:21 +00:00
parent 2b5b1fc86c
commit fe3afbf60d
5 changed files with 273 additions and 249 deletions

View File

@ -68,16 +68,19 @@ CSRCS = race.c \
# static lib.
FORCE_STATIC_LIB = 1
# need to include etld_data.inc
LOCAL_INCLUDES = \
-I$(srcdir)/../../base/src \
-I. \
$(NULL)
include $(topsrcdir)/config/rules.mk
DEFINES += -DIMPL_NS_NET
libs::
$(SYSINSTALL) $(IFLAGS1) $(srcdir)/effective_tld_names.dat $(DIST)/bin/res
# Generate the include file containing compact, static definitions
# for effective TLD data.
nsEffectiveTLDService.$(OBJ_SUFFIX): etld_data.inc
install::
$(SYSINSTALL) $(IFLAGS1) $(srcdir)/effective_tld_names.dat $(DESTDIR)$(mozappdir)/res
etld_data.inc: $(srcdir)/prepare_tlds.py $(srcdir)/effective_tld_names.dat
$(PYTHON) $(srcdir)/prepare_tlds.py $(srcdir)/effective_tld_names.dat > etld_data.inc

View File

@ -22,6 +22,7 @@
* Contributor(s):
* Pamela Greene <pamg.bugs@gmail.com> (original author)
* Daniel Witte <dwitte@stanford.edu>
* Jeff Walden <jwalden+code@mit.edu>
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
@ -42,79 +43,48 @@
// http://wiki.mozilla.org/Gecko:Effective_TLD_Service
#include "nsEffectiveTLDService.h"
#include "nsAppDirectoryServiceDefs.h"
#include "nsDirectoryServiceUtils.h"
#include "nsDirectoryServiceDefs.h"
#include "nsFileStreams.h"
#include "nsIFile.h"
#include "nsIIDNService.h"
#include "nsNetUtil.h"
#include "prnetdb.h"
// The file name of the list of TLD-like names. A file with this name in the
// system "res" directory will always be used. In addition, if a file with
// the same name is present in the user's profile directory, its contents will
// also be used, as though those rules were appended to the system file.
#define EFF_TLD_FILENAME NS_LITERAL_CSTRING("effective_tld_names.dat")
NS_IMPL_ISUPPORTS1(nsEffectiveTLDService, nsIEffectiveTLDService)
// ----------------------------------------------------------------------
#define PL_ARENA_CONST_ALIGN_MASK 3
#include "plarena.h"
static PLArenaPool *gArena = nsnull;
#define ARENA_SIZE 512
// equivalent to strdup() - does no error checking,
// we're assuming we're only called with a valid pointer
static char *
ArenaStrDup(const char* str, PLArenaPool* aArena)
{
void *mem;
PRUint32 size = strlen(str) + 1;
PL_ARENA_ALLOCATE(mem, aArena, size);
if (mem)
memcpy(mem, str, size);
return static_cast<char*>(mem);
}
// Builds a hash entry for |aDomain|. The key string is duplicated into the
// global arena via ArenaStrDup(), so mDomain is null if that allocation
// failed; all three rule flags start out false.
nsDomainEntry::nsDomainEntry(const char *aDomain)
: mDomain(ArenaStrDup(aDomain, gArena))
, mIsNormal(PR_FALSE)
, mIsException(PR_FALSE)
, mIsWild(PR_FALSE)
{
}
static const ETLDEntry gEntries[] =
#include "etld_data.inc"
;
// ----------------------------------------------------------------------
nsresult
nsEffectiveTLDService::Init()
{
if (!mHash.Init())
// We'll probably have to rehash at least once, since nsTHashtable doesn't
// use a perfect hash, but at least we'll save a few rehashes along the way.
// Next optimization here is to precompute the hash using something like
// gperf, but one step at a time. :-)
if (!mHash.Init(NS_ARRAY_LENGTH(gEntries) - 1))
return NS_ERROR_OUT_OF_MEMORY;
nsresult rv;
mIDNService = do_GetService(NS_IDNSERVICE_CONTRACTID, &rv);
if (NS_FAILED(rv)) return rv;
return LoadEffectiveTLDFiles();
}
nsEffectiveTLDService::nsEffectiveTLDService()
{
}
nsEffectiveTLDService::~nsEffectiveTLDService()
{
if (gArena) {
PL_FinishArenaPool(gArena);
delete gArena;
// Initialize eTLD hash from static array
for (PRUint32 i = 0; i < NS_ARRAY_LENGTH(gEntries) - 1; i++) {
#ifdef DEBUG
nsDependentCString name(gEntries[i].domain);
nsCAutoString normalizedName(gEntries[i].domain);
NS_ASSERTION(NS_SUCCEEDED(NormalizeHostname(normalizedName)),
"normalization failure!");
NS_ASSERTION(name.Equals(normalizedName), "domain not normalized!");
#endif
nsDomainEntry *entry = mHash.PutEntry(gEntries[i].domain);
NS_ENSURE_TRUE(entry, NS_ERROR_OUT_OF_MEMORY);
entry->SetData(&gEntries[i]);
}
gArena = nsnull;
return NS_OK;
}
// External function for dealing with URI's correctly.
@ -212,8 +182,8 @@ nsEffectiveTLDService::GetBaseDomainInternal(nsCString &aHostname,
if (result == PR_SUCCESS)
return NS_ERROR_HOST_IS_IP_ADDRESS;
// walk up the domain tree, most specific to least specific,
// looking for matches at each level. note that a given level may
// Walk up the domain tree, most specific to least specific,
// looking for matches at each level. Note that a given level may
// have multiple attributes (e.g. IsWild() and IsNormal()).
const char *prevDomain = nsnull;
const char *currDomain = aHostname.get();
@ -276,186 +246,18 @@ nsEffectiveTLDService::GetBaseDomainInternal(nsCString &aHostname,
return NS_OK;
}
// Normalizes characters of hostname. ASCII/ACE names are lower-cased,
// and UTF8 names are normalized per RFC 3454 and converted to ACE.
// Normalizes the given hostname, component by component. ASCII/ACE
// components are lower-cased, and UTF-8 components are normalized per
// RFC 3454 and converted to ACE.
nsresult
nsEffectiveTLDService::NormalizeHostname(nsCString &aHostname)
{
if (IsASCII(aHostname)) {
ToLowerCase(aHostname);
return NS_OK;
}
return mIDNService->ConvertUTF8toACE(aHostname, aHostname);
}
// Records one domain name rule in the effective-TLD hash.
//
// A leading "!" marks the rule as an exception and a leading "*." marks it
// as a wildcard; that prefix is removed before the name is stored.
//
// CAUTION: |aDomainName| is modified in place. Besides losing its prefix it
// is normalized, see NormalizeHostname().
//
// Returns NS_OK on success, the NormalizeHostname() failure code, or
// NS_ERROR_FAILURE / NS_ERROR_OUT_OF_MEMORY when the entry cannot be stored.
nsresult
nsEffectiveTLDService::AddEffectiveTLDEntry(nsCString &aDomainName)
{
  // The arena pool is created the first time a rule is added.
  if (!gArena) {
    gArena = new PLArenaPool;
    NS_ENSURE_TRUE(gArena, NS_ERROR_OUT_OF_MEMORY);
    PL_INIT_ARENA_POOL(gArena, "eTLDArena", ARENA_SIZE);
  }

  // Classify the rule by its prefix; a rule is never both kinds at once.
  const PRBool ruleIsException = (aDomainName.First() == '!');
  const PRBool ruleIsWild =
    !ruleIsException &&
    StringBeginsWith(aDomainName, NS_LITERAL_CSTRING("*."));

  // Strip the prefix so only the domain name itself remains.
  if (ruleIsException) {
    aDomainName.Cut(0, 1);
  } else if (ruleIsWild) {
    aDomainName.Cut(0, 2);
    NS_ASSERTION(!StringBeginsWith(aDomainName, NS_LITERAL_CSTRING("*.")),
                 "only one wildcard level supported!");
  }

  nsresult rv = NormalizeHostname(aDomainName);
  NS_ENSURE_SUCCESS(rv, rv);

  nsDomainEntry *entry = mHash.PutEntry(aDomainName.get());
  NS_ENSURE_TRUE(entry, NS_ERROR_FAILURE);

  // A null key means the arena could not supply memory for the name copy;
  // drop the unusable entry again.
  if (!entry->GetKey()) {
    mHash.RawRemoveEntry(entry);
    return NS_ERROR_OUT_OF_MEMORY;
  }

  // OR the flags in so that attributes set by an earlier rule for the same
  // name survive.
  entry->IsWild() |= ruleIsWild;
  entry->IsException() |= ruleIsException;
  // A wildcard rule also makes the bare name an eTLD (e.g. *.co.nz implies
  // co.nz), as does any rule that is not an exception.
  entry->IsNormal() |= ruleIsWild || !ruleIsException;

  return NS_OK;
}
// Locates the effective-TLD file. If aUseProfile is true, uses the file from
// the user's profile directory; otherwise uses the one from the system "res"
// directory. Places nsnull in foundFile if the desired file was not found.
nsresult
LocateEffectiveTLDFile(nsCOMPtr<nsIFile>& foundFile, PRBool aUseProfile)
{
foundFile = nsnull;
nsCOMPtr<nsIFile> effTLDFile = nsnull;
nsresult rv = NS_OK;
PRBool exists = PR_FALSE;
if (aUseProfile) {
// Look for the file in the user's profile directory.
rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR,
getter_AddRefs(effTLDFile));
// We allow a nonfatal error so that this component can be tested in an
// xpcshell with no profile present.
if (!IsASCII(aHostname)) {
nsresult rv = mIDNService->ConvertUTF8toACE(aHostname, aHostname);
if (NS_FAILED(rv))
return rv;
}
else {
// Look for the file in the application "res" directory.
rv = NS_GetSpecialDirectory(NS_OS_CURRENT_PROCESS_DIR,
getter_AddRefs(effTLDFile));
NS_ENSURE_SUCCESS(rv, rv);
rv = effTLDFile->AppendNative(NS_LITERAL_CSTRING("res"));
NS_ENSURE_SUCCESS(rv, rv);
}
rv = effTLDFile->AppendNative(EFF_TLD_FILENAME);
NS_ENSURE_SUCCESS(rv, rv);
rv = effTLDFile->Exists(&exists);
NS_ENSURE_SUCCESS(rv, rv);
if (exists)
foundFile = effTLDFile;
return rv;
}
// Cuts |aString| off at its first space or tab character, if it has one;
// otherwise the string is left unchanged.
void
TruncateAtWhitespace(nsCString &aString)
{
  nsASingleFragmentCString::const_char_iterator start, stop;
  aString.BeginReading(start);
  aString.EndReading(stop);

  // A byte-wise scan is safe for UTF-8 data because UTF-8 is a superset of
  // 7-bit ASCII.
  nsASingleFragmentCString::const_char_iterator cursor = start;
  while (cursor != stop && *cursor != ' ' && *cursor != '\t') {
    ++cursor;
  }

  if (cursor != stop) {
    aString.Truncate(cursor - start);
  }
}
// Loads the contents of the given effective-TLD file, building the tree as it
// goes.
nsresult
nsEffectiveTLDService::LoadOneEffectiveTLDFile(nsCOMPtr<nsIFile>& effTLDFile)
{
// Open the file as an input stream.
nsCOMPtr<nsIInputStream> fileStream;
nsresult rv = NS_NewLocalFileInputStream(getter_AddRefs(fileStream),
effTLDFile,
0x01, // read-only mode
-1, // all permissions
nsIFileInputStream::CLOSE_ON_EOF);
NS_ENSURE_SUCCESS(rv, rv);
nsCOMPtr<nsILineInputStream> lineStream = do_QueryInterface(fileStream, &rv);
NS_ENSURE_SUCCESS(rv, rv);
nsCAutoString lineData;
PRBool isMore;
NS_NAMED_LITERAL_CSTRING(kCommentMarker, "//");
while (NS_SUCCEEDED(lineStream->ReadLine(lineData, &isMore)) && isMore) {
if (StringBeginsWith(lineData, kCommentMarker))
continue;
TruncateAtWhitespace(lineData);
if (!lineData.IsEmpty()) {
rv = AddEffectiveTLDEntry(lineData);
NS_WARN_IF_FALSE(NS_SUCCEEDED(rv), "Error adding effective TLD to list");
}
}
ToLowerCase(aHostname);
return NS_OK;
}
// Loads the system effective-TLD file and then the one from the user's
// profile.
//
// A missing system file only produces a warning, but a failure while loading
// it is returned. Failing to locate the profile file is not an error (this
// allows testing using xpcshell); when the profile file is found, the result
// of loading it is returned.
nsresult
nsEffectiveTLDService::LoadEffectiveTLDFiles()
{
  nsCOMPtr<nsIFile> tldFile;

  // System copy: without it we can still struggle along using the base TLDs.
  nsresult rv = LocateEffectiveTLDFile(tldFile, PR_FALSE);
  if (NS_SUCCEEDED(rv) && tldFile) {
    rv = LoadOneEffectiveTLDFile(tldFile);
    NS_ENSURE_SUCCESS(rv, rv);
  }
  else {
    NS_WARNING("No effective-TLD file found in system res directory");
  }

  // Profile copy: optional.
  rv = LocateEffectiveTLDFile(tldFile, PR_TRUE);
  if (NS_SUCCEEDED(rv) && tldFile)
    return LoadOneEffectiveTLDFile(tldFile);

  return NS_OK;
}

View File

@ -22,6 +22,7 @@
* Contributor(s):
* Pamela Greene <pamg.bugs@gmail.com> (original author)
* Daniel Witte <dwitte@stanford.edu>
* Jeff Walden <jwalden+code@mit.edu>
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
@ -44,7 +45,14 @@
#include "nsCOMPtr.h"
class nsIIDNService;
class nsIFile;
// struct for static data generated from effective_tld_names.dat
//
// One element of the array that prepare_tlds.py writes to etld_data.inc.
// The generated array ends with an entry whose domain is nsnull.
struct ETLDEntry {
// Domain name of the rule with any "!" or "*." prefix removed, as
// normalized by prepare_tlds.py.
const char* domain;
// True if the rule was written with a leading "!".
PRPackedBool exception;
// True if the rule was written with a leading "*.".
PRPackedBool wild;
};
// hash entry class
class nsDomainEntry : public PLDHashEntryHdr
@ -54,7 +62,9 @@ public:
typedef const char* KeyType;
typedef const char* KeyTypePointer;
nsDomainEntry(const char* aDomain);
nsDomainEntry(KeyTypePointer aEntry)
{
}
nsDomainEntry(const nsDomainEntry& toCopy)
{
@ -69,12 +79,12 @@ public:
KeyType GetKey() const
{
return mDomain;
return mData->domain;
}
PRBool KeyEquals(KeyTypePointer aKey) const
{
return !strcmp(mDomain, aKey);
return !strcmp(mData->domain, aKey);
}
static KeyTypePointer KeyToPointer(KeyType aKey)
@ -91,15 +101,14 @@ public:
enum { ALLOW_MEMMOVE = PR_TRUE };
PRPackedBool& IsNormal() { return mIsNormal; }
PRPackedBool& IsException() { return mIsException; }
PRPackedBool& IsWild() { return mIsWild; }
void SetData(const ETLDEntry* entry) { mData = entry; }
PRPackedBool IsNormal() { return mData->wild || !mData->exception; }
PRPackedBool IsException() { return mData->exception; }
PRPackedBool IsWild() { return mData->wild; }
private:
const char *mDomain;
PRPackedBool mIsNormal;
PRPackedBool mIsException;
PRPackedBool mIsWild;
const ETLDEntry* mData;
};
class nsEffectiveTLDService : public nsIEffectiveTLDService
@ -108,17 +117,13 @@ public:
NS_DECL_ISUPPORTS
NS_DECL_NSIEFFECTIVETLDSERVICE
nsEffectiveTLDService();
nsEffectiveTLDService() { }
nsresult Init();
private:
nsresult GetBaseDomainInternal(nsCString &aHostname, PRUint32 aAdditionalParts, nsACString &aBaseDomain);
nsresult NormalizeHostname(nsCString &aHostname);
nsresult AddEffectiveTLDEntry(nsCString &aDomainName);
nsresult LoadEffectiveTLDFiles();
nsresult LoadOneEffectiveTLDFile(nsCOMPtr<nsIFile>& effTLDFile);
virtual ~nsEffectiveTLDService();
~nsEffectiveTLDService() { }
nsTHashtable<nsDomainEntry> mHash;
nsCOMPtr<nsIIDNService> mIDNService;

View File

@ -0,0 +1,152 @@
# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Effective TLD conversion code.
#
# The Initial Developer of the Original Code is
# Jeff Walden <jwalden+code@mit.edu>.
# Portions created by the Initial Developer are Copyright (C) 2008
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ***** END LICENSE BLOCK *****
import codecs
import encodings.idna
import re
import sets
import sys
"""
Processes a file containing effective TLD data. See the following URL for a
description of effective TLDs and of the file format that this script
processes (although for the latter you're better off just reading this file's
short source code).
http://wiki.mozilla.org/Gecko:Effective_TLD_Service
"""
def getEffectiveTLDs(path):
  """
  Generator which yields an EffectiveTLDEntry for each rule in the UTF-8
  encoded effective TLD file at the given path.

  Comment lines (starting with "//"), lines containing no ".", and lines
  which have no text before their first whitespace character are skipped.
  Only the text up to the first whitespace character of a line is used.

  Asserts if two rules normalize to the same domain.
  """
  file = codecs.open(path, "r", "UTF-8")
  domains = sets.Set()
  while True:
    line = file.readline()
    # readline() returns an empty string only at end of file; a blank line
    # still carries its line terminator
    if len(line) == 0:
      # Finish the generator with a plain return: explicitly raising
      # StopIteration inside a generator is converted into a RuntimeError
      # by newer Pythons (PEP 479). The file has been read completely, so
      # release its handle as well.
      file.close()
      return
    line = line.rstrip()
    # comment, empty, or superfluous line for explicitness purposes
    if line.startswith("//") or "." not in line:
      continue
    line = re.split(r"[ \t\n]", line, 1)[0]
    # a line beginning with whitespace has no rule text left at this point;
    # don't emit an entry with an empty domain for it
    if len(line) == 0:
      continue
    entry = EffectiveTLDEntry(line)
    domain = entry.domain()
    assert domain not in domains, \
           "repeating domain %s makes no sense" % domain
    domains.add(domain)
    yield entry
def _normalizeHostname(domain):
  """
  Returns the given domain with every "."-separated component normalized:
  a component made up solely of ASCII characters is lowercased, any other
  component is converted with the IDNA ToASCII algorithm.
  """
  normalized = []
  for label in domain.split("."):
    if _isASCII(label):
      normalized.append(label.lower())
    else:
      normalized.append(encodings.idna.ToASCII(label))
  return ".".join(normalized)
def _isASCII(s):
  "True if no character of s has a code point above 127, false otherwise."
  nonASCII = [c for c in s if ord(c) > 127]
  return len(nonASCII) == 0
class EffectiveTLDEntry:
  """
  A single rule read from an effective-TLD name file: the normalized domain
  plus whether the rule was marked as an exception ("!") or a wildcard ("*.").
  """

  # Defaults for rules carrying neither marker.
  _exception = False
  _wild = False

  def __init__(self, line):
    """
    Parses one rule. The line must already have had its line ending removed.
    A leading "!" or "*." marker is recorded and stripped; what remains is
    normalized and stored as the domain.
    """
    isException = line.startswith("!")
    isWild = not isException and line.startswith("*.")

    if isException:
      self._exception = True
      name = line[1:]
    elif isWild:
      self._wild = True
      name = line[2:]
    else:
      name = line

    self._domain = _normalizeHostname(name)

  def domain(self):
    "The normalized domain of this entry, without any rule marker."
    return self._domain

  def exception(self):
    "True if this entry's domain does not denote an effective TLD."
    return self._exception

  def wild(self):
    "True if this entry represents a class of effective TLDs."
    return self._wild
#################
# DO EVERYTHING #
#################
def main():
  """
  Reads the effective TLD file named by argv[1] and writes a C++ array
  initializer to stdout: one { domain, exception, wild } entry per rule,
  followed by a closing { nsnull, PR_FALSE, PR_FALSE } entry.
  """

  def prBool(flag):
    # Spelling of a boolean as the corresponding NSPR constant.
    return flag and "PR_TRUE" or "PR_FALSE"

  emit = sys.stdout.write

  emit("{\n")
  for rule in getEffectiveTLDs(sys.argv[1]):
    isException = prBool(rule.exception())
    isWild = prBool(rule.wild())
    emit(' { "%s", %s, %s },' % (rule.domain(), isException, isWild) + "\n")
  emit(" { nsnull, PR_FALSE, PR_FALSE }\n")
  emit("}\n")

# Only generate output when run as a script, not when imported.
if __name__ == '__main__':
  main()

View File

@ -0,0 +1,62 @@
// Shorthand for the XPCOM class and interface tables.
const Cc = Components.classes;
const Ci = Components.interfaces;
// Flag value passed as the second argument of the file input stream's init()
// call in run_test().
const PR_RDONLY = 0x1;
// The effective TLD service under test; queried by checkPublicSuffix().
var etld = Cc["@mozilla.org/network/effective-tld-service;1"]
.getService(Ci.nsIEffectiveTLDService);
// IDN service, used by checkPublicSuffix() to convert the expected suffix
// to ACE form before comparing.
var idn = Cc["@mozilla.org/network/idn-service;1"]
.getService(Ci.nsIIDNService);
/**
 * Reads effective_tld_names.dat line by line and, for every rule in it,
 * checks that the effective TLD service reports the public suffix that the
 * rule calls for:
 *   - "*.rest" rules:  foo.PREFIX.rest  ->  PREFIX.rest
 *   - "!name" rules:   name             ->  name minus its first component
 *   - other rules:     PREFIX.rule      ->  rule
 * Empty lines and lines starting with "//" are skipped, and only the text
 * before the first space or tab of a line is treated as the rule.
 */
function run_test()
{
  var PREFIX = "SUPER-SPECIAL-AWESOME-PREFIX.";
  // Octal 0444, passed as the third argument of the stream's init().
  var filePerms = parseInt("0444", 8);

  var fileStream = Cc["@mozilla.org/network/file-input-stream;1"]
                     .createInstance(Ci.nsIFileInputStream);
  fileStream.init(do_get_file("netwerk/dns/src/effective_tld_names.dat"),
                  PR_RDONLY, filePerms, Ci.nsIFileInputStream.CLOSE_ON_EOF);

  var lineStream = Cc["@mozilla.org/intl/converter-input-stream;1"]
                     .createInstance(Ci.nsIConverterInputStream);
  lineStream.init(fileStream, "UTF-8", 1024, 0);
  lineStream.QueryInterface(Ci.nsIUnicharLineInputStream);

  var holder = { value: "" };
  var hasMore = true;
  while (hasMore)
  {
    // readLine() reports whether further lines follow the one just read, so
    // the line read on the final pass is still processed below.
    hasMore = lineStream.readLine(holder);
    var rule = holder.value;

    // Up to two leading characters: enough to spot "", "//" and "*.".
    var marker = rule.substring(0, 2);
    if (marker == "" || marker == "//")
      continue;

    // Anything from the first space or tab onwards is not part of the rule.
    var wsIndex = rule.search(/[ \t]/);
    if (wsIndex != -1)
      rule = rule.substring(0, wsIndex);

    if (marker == "*.")
    {
      var rest = rule.substring(2);
      checkPublicSuffix("foo." + PREFIX + rest, PREFIX + rest);
    }
    else if (rule.charAt(0) == "!")
    {
      checkPublicSuffix(rule.substring(1),
                        rule.substring(rule.indexOf(".") + 1));
    }
    else
    {
      checkPublicSuffix(PREFIX + rule, rule);
    }
  }
}
/**
 * Checks that the effective TLD service returns the expected public suffix
 * for a host.
 *
 * @param host            host name handed to getPublicSuffixFromHost()
 * @param expectedSuffix  suffix the service should return; it is converted
 *                        to ACE and lowercased before the comparison
 */
function checkPublicSuffix(host, expectedSuffix)
{
  var normalizedExpected = idn.convertUTF8toACE(expectedSuffix).toLowerCase();
  do_check_eq(etld.getPublicSuffixFromHost(host), normalizedExpected);
}