From fe3afbf60ddfe5968a2c1f79b9f47ddcf6d36fbc Mon Sep 17 00:00:00 2001 From: "jwalden%mit.edu" Date: Thu, 14 Feb 2008 22:57:21 +0000 Subject: [PATCH] Bug 414122 - Preprocess effective TLD data into C++ code (eliminate file I/O to read, move parsing out of C++, remove an arena, etc.). r=dwitte, sr=bsmedberg, a=blocker git-svn-id: svn://10.0.0.236/trunk@245738 18797224-902f-48f8-a5cc-f745e15eee43 --- mozilla/netwerk/dns/src/Makefile.in | 11 +- .../netwerk/dns/src/nsEffectiveTLDService.cpp | 258 ++---------------- .../netwerk/dns/src/nsEffectiveTLDService.h | 39 +-- mozilla/netwerk/dns/src/prepare_tlds.py | 152 +++++++++++ mozilla/netwerk/test/unit/test_bug414122.js | 62 +++++ 5 files changed, 273 insertions(+), 249 deletions(-) create mode 100644 mozilla/netwerk/dns/src/prepare_tlds.py create mode 100644 mozilla/netwerk/test/unit/test_bug414122.js diff --git a/mozilla/netwerk/dns/src/Makefile.in b/mozilla/netwerk/dns/src/Makefile.in index bef9d67e63f..b79ff7dcb02 100644 --- a/mozilla/netwerk/dns/src/Makefile.in +++ b/mozilla/netwerk/dns/src/Makefile.in @@ -68,16 +68,19 @@ CSRCS = race.c \ # static lib. FORCE_STATIC_LIB = 1 +# need to include etld_data.inc LOCAL_INCLUDES = \ -I$(srcdir)/../../base/src \ + -I. \ $(NULL) include $(topsrcdir)/config/rules.mk DEFINES += -DIMPL_NS_NET -libs:: - $(SYSINSTALL) $(IFLAGS1) $(srcdir)/effective_tld_names.dat $(DIST)/bin/res +# Generate the include file containing compact, static definitions +# for effective TLD data. 
+nsEffectiveTLDService.$(OBJ_SUFFIX): etld_data.inc -install:: - $(SYSINSTALL) $(IFLAGS1) $(srcdir)/effective_tld_names.dat $(DESTDIR)$(mozappdir)/res +etld_data.inc: $(srcdir)/prepare_tlds.py $(srcdir)/effective_tld_names.dat + $(PYTHON) $(srcdir)/prepare_tlds.py $(srcdir)/effective_tld_names.dat > etld_data.inc diff --git a/mozilla/netwerk/dns/src/nsEffectiveTLDService.cpp b/mozilla/netwerk/dns/src/nsEffectiveTLDService.cpp index 2e56e7466fc..07d4456bd07 100644 --- a/mozilla/netwerk/dns/src/nsEffectiveTLDService.cpp +++ b/mozilla/netwerk/dns/src/nsEffectiveTLDService.cpp @@ -22,6 +22,7 @@ * Contributor(s): * Pamela Greene (original author) * Daniel Witte + * Jeff Walden * * Alternatively, the contents of this file may be used under the terms of * either the GNU General Public License Version 2 or later (the "GPL"), or @@ -42,79 +43,48 @@ // http://wiki.mozilla.org/Gecko:Effective_TLD_Service #include "nsEffectiveTLDService.h" -#include "nsAppDirectoryServiceDefs.h" -#include "nsDirectoryServiceUtils.h" -#include "nsDirectoryServiceDefs.h" -#include "nsFileStreams.h" -#include "nsIFile.h" #include "nsIIDNService.h" #include "nsNetUtil.h" #include "prnetdb.h" -// The file name of the list of TLD-like names. A file with this name in the -// system "res" directory will always be used. In addition, if a file with -// the same name is present in the user's profile directory, its contents will -// also be used, as though those rules were appended to the system file. 
-#define EFF_TLD_FILENAME NS_LITERAL_CSTRING("effective_tld_names.dat") - NS_IMPL_ISUPPORTS1(nsEffectiveTLDService, nsIEffectiveTLDService) // ---------------------------------------------------------------------- -#define PL_ARENA_CONST_ALIGN_MASK 3 -#include "plarena.h" - -static PLArenaPool *gArena = nsnull; - -#define ARENA_SIZE 512 - -// equivalent to strdup() - does no error checking, -// we're assuming we're only called with a valid pointer -static char * -ArenaStrDup(const char* str, PLArenaPool* aArena) -{ - void *mem; - PRUint32 size = strlen(str) + 1; - PL_ARENA_ALLOCATE(mem, aArena, size); - if (mem) - memcpy(mem, str, size); - return static_cast(mem); -} - -nsDomainEntry::nsDomainEntry(const char *aDomain) - : mDomain(ArenaStrDup(aDomain, gArena)) - , mIsNormal(PR_FALSE) - , mIsException(PR_FALSE) - , mIsWild(PR_FALSE) -{ -} +static const ETLDEntry gEntries[] = +#include "etld_data.inc" +; // ---------------------------------------------------------------------- nsresult nsEffectiveTLDService::Init() { - if (!mHash.Init()) + // We'll probably have to rehash at least once, since nsTHashtable doesn't + // use a perfect hash, but at least we'll save a few rehashes along the way. + // Next optimization here is to precompute the hash using something like + // gperf, but one step at a time. 
:-) + if (!mHash.Init(NS_ARRAY_LENGTH(gEntries) - 1)) return NS_ERROR_OUT_OF_MEMORY; nsresult rv; mIDNService = do_GetService(NS_IDNSERVICE_CONTRACTID, &rv); if (NS_FAILED(rv)) return rv; - return LoadEffectiveTLDFiles(); -} - -nsEffectiveTLDService::nsEffectiveTLDService() -{ -} - -nsEffectiveTLDService::~nsEffectiveTLDService() -{ - if (gArena) { - PL_FinishArenaPool(gArena); - delete gArena; + // Initialize eTLD hash from static array + for (PRUint32 i = 0; i < NS_ARRAY_LENGTH(gEntries) - 1; i++) { +#ifdef DEBUG + nsDependentCString name(gEntries[i].domain); + nsCAutoString normalizedName(gEntries[i].domain); + NS_ASSERTION(NS_SUCCEEDED(NormalizeHostname(normalizedName)), + "normalization failure!"); + NS_ASSERTION(name.Equals(normalizedName), "domain not normalized!"); +#endif + nsDomainEntry *entry = mHash.PutEntry(gEntries[i].domain); + NS_ENSURE_TRUE(entry, NS_ERROR_OUT_OF_MEMORY); + entry->SetData(&gEntries[i]); } - gArena = nsnull; + return NS_OK; } // External function for dealing with URI's correctly. @@ -212,8 +182,8 @@ nsEffectiveTLDService::GetBaseDomainInternal(nsCString &aHostname, if (result == PR_SUCCESS) return NS_ERROR_HOST_IS_IP_ADDRESS; - // walk up the domain tree, most specific to least specific, - // looking for matches at each level. note that a given level may + // Walk up the domain tree, most specific to least specific, + // looking for matches at each level. Note that a given level may // have multiple attributes (e.g. IsWild() and IsNormal()). const char *prevDomain = nsnull; const char *currDomain = aHostname.get(); @@ -276,186 +246,18 @@ nsEffectiveTLDService::GetBaseDomainInternal(nsCString &aHostname, return NS_OK; } -// Normalizes characters of hostname. ASCII/ACE names are lower-cased, -// and UTF8 names are normalized per RFC 3454 and converted to ACE. +// Normalizes the given hostname, component by component. ASCII/ACE +// components are lower-cased, and UTF-8 components are normalized per +// RFC 3454 and converted to ACE. 
nsresult nsEffectiveTLDService::NormalizeHostname(nsCString &aHostname) { - if (IsASCII(aHostname)) { - ToLowerCase(aHostname); - return NS_OK; - } - - return mIDNService->ConvertUTF8toACE(aHostname, aHostname); -} - -// Adds the given domain name rule to the effective-TLD hash. -// CAUTION: As a side effect, the domain name rule will be normalized. -// see NormalizeHostname(). -nsresult -nsEffectiveTLDService::AddEffectiveTLDEntry(nsCString &aDomainName) -{ - // lazily init the arena pool - if (!gArena) { - gArena = new PLArenaPool; - NS_ENSURE_TRUE(gArena, NS_ERROR_OUT_OF_MEMORY); - PL_INIT_ARENA_POOL(gArena, "eTLDArena", ARENA_SIZE); - } - - PRBool isException = PR_FALSE, isWild = PR_FALSE; - - // Is this node an exception? - if (aDomainName.First() == '!') { - isException = PR_TRUE; - aDomainName.Cut(0, 1); - - // ... or wild? - } else if (StringBeginsWith(aDomainName, NS_LITERAL_CSTRING("*."))) { - isWild = PR_TRUE; - aDomainName.Cut(0, 2); - - NS_ASSERTION(!StringBeginsWith(aDomainName, NS_LITERAL_CSTRING("*.")), - "only one wildcard level supported!"); - } - - // Normalize the domain name. - nsresult rv = NormalizeHostname(aDomainName); - NS_ENSURE_SUCCESS(rv, rv); - - nsDomainEntry *entry = mHash.PutEntry(aDomainName.get()); - NS_ENSURE_TRUE(entry, NS_ERROR_FAILURE); - - // check for arena string alloc failure - if (!entry->GetKey()) { - mHash.RawRemoveEntry(entry); - return NS_ERROR_OUT_OF_MEMORY; - } - - // add the new flags, without stomping existing ones - entry->IsWild() |= isWild; - entry->IsException() |= isException; - // note: isWild also implies isNormal (e.g. *.co.nz also implies the co.nz eTLD) - entry->IsNormal() |= isWild || !isException; - - return NS_OK; -} - -// Locates the effective-TLD file. If aUseProfile is true, uses the file from -// the user's profile directory; otherwise uses the one from the system "res" -// directory. Places nsnull in foundFile if the desired file was not found. 
-nsresult -LocateEffectiveTLDFile(nsCOMPtr& foundFile, PRBool aUseProfile) -{ - foundFile = nsnull; - - nsCOMPtr effTLDFile = nsnull; - nsresult rv = NS_OK; - PRBool exists = PR_FALSE; - if (aUseProfile) { - // Look for the file in the user's profile directory. - rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR, - getter_AddRefs(effTLDFile)); - // We allow a nonfatal error so that this component can be tested in an - // xpcshell with no profile present. + if (!IsASCII(aHostname)) { + nsresult rv = mIDNService->ConvertUTF8toACE(aHostname, aHostname); if (NS_FAILED(rv)) return rv; } - else { - // Look for the file in the application "res" directory. - rv = NS_GetSpecialDirectory(NS_OS_CURRENT_PROCESS_DIR, - getter_AddRefs(effTLDFile)); - NS_ENSURE_SUCCESS(rv, rv); - - rv = effTLDFile->AppendNative(NS_LITERAL_CSTRING("res")); - NS_ENSURE_SUCCESS(rv, rv); - } - - rv = effTLDFile->AppendNative(EFF_TLD_FILENAME); - NS_ENSURE_SUCCESS(rv, rv); - - rv = effTLDFile->Exists(&exists); - NS_ENSURE_SUCCESS(rv, rv); - - if (exists) - foundFile = effTLDFile; - - return rv; -} - -void -TruncateAtWhitespace(nsCString &aString) -{ - // Searching for a space or tab one byte at a time is fine since UTF-8 is a - // superset of 7-bit ASCII. - nsASingleFragmentCString::const_char_iterator begin, iter, end; - aString.BeginReading(begin); - aString.EndReading(end); - - for (iter = begin; iter != end; ++iter) { - if (*iter == ' ' || *iter == '\t') { - aString.Truncate(iter - begin); - break; - } - } -} - -// Loads the contents of the given effective-TLD file, building the tree as it -// goes. -nsresult -nsEffectiveTLDService::LoadOneEffectiveTLDFile(nsCOMPtr& effTLDFile) -{ - // Open the file as an input stream. 
- nsCOMPtr fileStream; - nsresult rv = NS_NewLocalFileInputStream(getter_AddRefs(fileStream), - effTLDFile, - 0x01, // read-only mode - -1, // all permissions - nsIFileInputStream::CLOSE_ON_EOF); - NS_ENSURE_SUCCESS(rv, rv); - - nsCOMPtr lineStream = do_QueryInterface(fileStream, &rv); - NS_ENSURE_SUCCESS(rv, rv); - - nsCAutoString lineData; - PRBool isMore; - NS_NAMED_LITERAL_CSTRING(kCommentMarker, "//"); - while (NS_SUCCEEDED(lineStream->ReadLine(lineData, &isMore)) && isMore) { - if (StringBeginsWith(lineData, kCommentMarker)) - continue; - - TruncateAtWhitespace(lineData); - if (!lineData.IsEmpty()) { - rv = AddEffectiveTLDEntry(lineData); - NS_WARN_IF_FALSE(NS_SUCCEEDED(rv), "Error adding effective TLD to list"); - } - } + ToLowerCase(aHostname); return NS_OK; } - -// Loads the contents of the system and user effective-TLD files. -nsresult -nsEffectiveTLDService::LoadEffectiveTLDFiles() -{ - nsCOMPtr effTLDFile; - nsresult rv = LocateEffectiveTLDFile(effTLDFile, PR_FALSE); - - // If we didn't find any system effective-TLD file, warn but keep trying. We - // can struggle along using the base TLDs. - if (NS_FAILED(rv) || nsnull == effTLDFile) { - NS_WARNING("No effective-TLD file found in system res directory"); - } - else { - rv = LoadOneEffectiveTLDFile(effTLDFile); - NS_ENSURE_SUCCESS(rv, rv); - } - - rv = LocateEffectiveTLDFile(effTLDFile, PR_TRUE); - - // Since the profile copy isn't strictly needed, ignore any errors trying to - // find or read it, in order to allow testing using xpcshell. 
- if (NS_FAILED(rv) || nsnull == effTLDFile) - return NS_OK; - - return LoadOneEffectiveTLDFile(effTLDFile); -} diff --git a/mozilla/netwerk/dns/src/nsEffectiveTLDService.h b/mozilla/netwerk/dns/src/nsEffectiveTLDService.h index e381abc925e..e29733e1186 100644 --- a/mozilla/netwerk/dns/src/nsEffectiveTLDService.h +++ b/mozilla/netwerk/dns/src/nsEffectiveTLDService.h @@ -22,6 +22,7 @@ * Contributor(s): * Pamela Greene (original author) * Daniel Witte + * Jeff Walden * * Alternatively, the contents of this file may be used under the terms of * either the GNU General Public License Version 2 or later (the "GPL"), or @@ -44,7 +45,14 @@ #include "nsCOMPtr.h" class nsIIDNService; -class nsIFile; + +// struct for static data generated from effective_tld_names.dat +struct ETLDEntry { + const char* domain; + PRPackedBool exception; + PRPackedBool wild; +}; + // hash entry class class nsDomainEntry : public PLDHashEntryHdr @@ -54,7 +62,9 @@ public: typedef const char* KeyType; typedef const char* KeyTypePointer; - nsDomainEntry(const char* aDomain); + nsDomainEntry(KeyTypePointer aEntry) + { + } nsDomainEntry(const nsDomainEntry& toCopy) { @@ -69,12 +79,12 @@ public: KeyType GetKey() const { - return mDomain; + return mData->domain; } PRBool KeyEquals(KeyTypePointer aKey) const { - return !strcmp(mDomain, aKey); + return !strcmp(mData->domain, aKey); } static KeyTypePointer KeyToPointer(KeyType aKey) @@ -91,15 +101,14 @@ public: enum { ALLOW_MEMMOVE = PR_TRUE }; - PRPackedBool& IsNormal() { return mIsNormal; } - PRPackedBool& IsException() { return mIsException; } - PRPackedBool& IsWild() { return mIsWild; } + void SetData(const ETLDEntry* entry) { mData = entry; } + + PRPackedBool IsNormal() { return mData->wild || !mData->exception; } + PRPackedBool IsException() { return mData->exception; } + PRPackedBool IsWild() { return mData->wild; } private: - const char *mDomain; - PRPackedBool mIsNormal; - PRPackedBool mIsException; - PRPackedBool mIsWild; + const ETLDEntry* mData; 
}; class nsEffectiveTLDService : public nsIEffectiveTLDService @@ -108,17 +117,13 @@ public: NS_DECL_ISUPPORTS NS_DECL_NSIEFFECTIVETLDSERVICE - nsEffectiveTLDService(); + nsEffectiveTLDService() { } nsresult Init(); private: nsresult GetBaseDomainInternal(nsCString &aHostname, PRUint32 aAdditionalParts, nsACString &aBaseDomain); nsresult NormalizeHostname(nsCString &aHostname); - nsresult AddEffectiveTLDEntry(nsCString &aDomainName); - nsresult LoadEffectiveTLDFiles(); - nsresult LoadOneEffectiveTLDFile(nsCOMPtr& effTLDFile); - - virtual ~nsEffectiveTLDService(); + ~nsEffectiveTLDService() { } nsTHashtable mHash; nsCOMPtr mIDNService; diff --git a/mozilla/netwerk/dns/src/prepare_tlds.py b/mozilla/netwerk/dns/src/prepare_tlds.py new file mode 100644 index 00000000000..1241c3369c1 --- /dev/null +++ b/mozilla/netwerk/dns/src/prepare_tlds.py @@ -0,0 +1,152 @@ +# ***** BEGIN LICENSE BLOCK ***** +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Effective TLD conversion code. +# +# The Initial Developer of the Original Code is +# Jeff Walden . +# Portions created by the Initial Developer are Copyright (C) 2008 +# the Initial Developer. All Rights Reserved. 
+# +# Contributor(s): +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ***** END LICENSE BLOCK ***** + +import codecs +import encodings.idna +import re +import sets +import sys + +""" +Processes a file containing effective TLD data. See the following URL for a +description of effective TLDs and of the file format that this script +processes (although for the latter you're better off just reading this file's +short source code). + +http://wiki.mozilla.org/Gecko:Effective_TLD_Service +""" + +def getEffectiveTLDs(path): + file = codecs.open(path, "r", "UTF-8") + domains = sets.Set() + while True: + line = file.readline() + # line always contains a line terminator unless the file is empty + if len(line) == 0: + raise StopIteration + line = line.rstrip() + # comment, empty, or superfluous line for explicitness purposes + if line.startswith("//") or "." not in line: + continue + line = re.split(r"[ \t\n]", line, 1)[0] + entry = EffectiveTLDEntry(line) + domain = entry.domain() + assert domain not in domains, \ + "repeating domain %s makes no sense" % domain + domains.add(domain) + yield entry + +def _normalizeHostname(domain): + """ + Normalizes the given domain, component by component. 
ASCII components are + lowercased, while non-ASCII components are processed using the ToASCII + algorithm. + """ + def convertLabel(label): + if _isASCII(label): + return label.lower() + return encodings.idna.ToASCII(label) + return ".".join(map(convertLabel, domain.split("."))) + +def _isASCII(s): + "True if s consists entirely of ASCII characters, false otherwise." + for c in s: + if ord(c) > 127: + return False + return True + +class EffectiveTLDEntry: + """ + Stores an entry in an effective-TLD name file. + """ + + _exception = False + _wild = False + + def __init__(self, line): + """ + Creates a TLD entry from a line of data, which must have been stripped of + the line ending. + """ + if line.startswith("!"): + self._exception = True + domain = line[1:] + elif line.startswith("*."): + self._wild = True + domain = line[2:] + else: + domain = line + self._domain = _normalizeHostname(domain) + + def domain(self): + "The domain this represents." + return self._domain + + def exception(self): + "True if this entry's domain does not denote an effective TLD." + return self._exception + + def wild(self): + "True if this entry represents a class of effective TLDs." + return self._wild + + +################# +# DO EVERYTHING # +################# + +def main(): + """ + argv[1] is the effective TLD file to parse. + A C++ array of { domain, exception, wild } entries representing the + eTLD file is then printed to stdout.
+ """ + + def boolStr(b): + if b: + return "PR_TRUE" + return "PR_FALSE" + + print "{" + for etld in getEffectiveTLDs(sys.argv[1]): + exception = boolStr(etld.exception()) + wild = boolStr(etld.wild()) + print ' { "%s", %s, %s },' % (etld.domain(), exception, wild) + print " { nsnull, PR_FALSE, PR_FALSE }" + print "}" + +if __name__ == '__main__': + main() diff --git a/mozilla/netwerk/test/unit/test_bug414122.js b/mozilla/netwerk/test/unit/test_bug414122.js new file mode 100644 index 00000000000..3db8ee9b789 --- /dev/null +++ b/mozilla/netwerk/test/unit/test_bug414122.js @@ -0,0 +1,62 @@ +const Cc = Components.classes; +const Ci = Components.interfaces; + +const PR_RDONLY = 0x1; + +var etld = Cc["@mozilla.org/network/effective-tld-service;1"] + .getService(Ci.nsIEffectiveTLDService); +var idn = Cc["@mozilla.org/network/idn-service;1"] + .getService(Ci.nsIIDNService); + +function run_test() +{ + var fis = Cc["@mozilla.org/network/file-input-stream;1"] + .createInstance(Ci.nsIFileInputStream); + fis.init(do_get_file("netwerk/dns/src/effective_tld_names.dat"), + PR_RDONLY, 0444, Ci.nsIFileInputStream.CLOSE_ON_EOF); + + var lis = Cc["@mozilla.org/intl/converter-input-stream;1"] + .createInstance(Ci.nsIConverterInputStream); + lis.init(fis, "UTF-8", 1024, 0); + lis.QueryInterface(Ci.nsIUnicharLineInputStream); + + var out = { value: "" }; + do + { + var more = lis.readLine(out); + var line = out.value; + + var firstTwo = line.substring(0, 2); // a misnomer, but whatever + if (firstTwo == "" || firstTwo == "//") + continue; + + var space = line.search(/[ \t]/); + line = line.substring(0, space == -1 ? line.length : space); + + if ("*." == firstTwo) + { + let (rest = line.substring(2)) + { + checkPublicSuffix("foo.SUPER-SPECIAL-AWESOME-PREFIX." + rest, + "SUPER-SPECIAL-AWESOME-PREFIX." + rest); + } + } + else if ("!" 
== line.charAt(0)) + { + checkPublicSuffix(line.substring(1), + line.substring(line.indexOf(".") + 1)); + } + else + { + checkPublicSuffix("SUPER-SPECIAL-AWESOME-PREFIX." + line, line); + } + } + while (more); +} + +function checkPublicSuffix(host, expectedSuffix) +{ + expectedSuffix = idn.convertUTF8toACE(expectedSuffix).toLowerCase(); + var actualSuffix = etld.getPublicSuffixFromHost(host); + do_check_eq(actualSuffix, expectedSuffix); +}