Mozilla/mozilla/content/base/src/mozSanitizingSerializer.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: NPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Netscape Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/NPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is mozilla.org HTML Sanitizer code.
 *
 * The Initial Developer of the Original Code is
 * Ben Bucksch <mozilla@bucksch.org>.
 * Portions created by the Initial Developer are Copyright (C) 2002
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *     Netscape
 *
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the NPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the NPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

/* I used nsPlaintextSerializer as base for this class. I don't understand
   all of the functions in the beginning. Possible that I fail to do
   something or do something useless.
   I am not proud about the implementation here at all.
   Feel free to fix it :-).

   I am moderately concerned about methods to obfuscate HTML, which the
   parser can decode during execution.
   E.g. there are these dreaded data: and javascript URLs and
   base64 encoding (which I don't really understand how it alloies
*/

#include "mozSanitizingSerializer.h"
#include "nsIServiceManager.h"
#include "nsHTMLAtoms.h"
#include "nsIDOMText.h"
#include "nsIDOMElement.h"
#include "nsINameSpaceManager.h"
#include "nsIHTMLContent.h"
#include "nsITextContent.h"
#include "nsTextFragment.h"
#include "nsParserCIID.h"
#include "nsContentUtils.h"
#include "nsReadableUtils.h"
#include "plstr.h"
//#include "nsDependentString.h"
#include "nsIProperties.h"

//#define DEBUG_BenB

/* XXX: |printf|s in some error conditions. They are intended as information
   for the user, because they complain about malformed pref values.
   Not sure, if popping up dialog boxes is the right thing for such code
   (and if so, how to do it).
 */

static NS_DEFINE_CID(kParserServiceCID, NS_PARSERSERVICE_CID);

#define TEXT_REMOVED "&lt;Text removed&gt;"
#define TEXT_BREAKER "|"

nsresult NS_NewSanitizingHTMLSerializer(nsIContentSerializer** aSerializer)
{
  mozSanitizingHTMLSerializer* it = new mozSanitizingHTMLSerializer();
  if (!it) {
    return NS_ERROR_OUT_OF_MEMORY;
  }
  NS_ADDREF(it);
  *aSerializer = it;
  return NS_OK;
}

mozSanitizingHTMLSerializer::mozSanitizingHTMLSerializer()
  : mAllowedTags(30) // Just some initial buffer size
{
  NS_INIT_ISUPPORTS();

  mOutputString = nsnull;
}

mozSanitizingHTMLSerializer::~mozSanitizingHTMLSerializer()
{
  mAllowedTags.Enumerate(ReleaseProperties);
}

//<copied from="xpcom/ds/nsProperties.cpp">
PRBool PR_CALLBACK
mozSanitizingHTMLSerializer::ReleaseProperties(nsHashKey* key, void* data,
                                               void* closure)
{
  nsIProperties* prop = (nsIProperties*)data;
  NS_IF_RELEASE(prop);
  return PR_TRUE;
}
//</copied>

NS_IMPL_ISUPPORTS4(mozSanitizingHTMLSerializer,
                   nsIContentSerializer,
                   nsIContentSink,
                   nsIHTMLContentSink,
                   mozISanitizingHTMLSerializer)


NS_IMETHODIMP
mozSanitizingHTMLSerializer::Init(PRUint32 aFlags, PRUint32 dummy,
                                  nsIAtom* aCharSet, PRBool aIsCopying)
{
  return NS_OK;
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::Initialize(nsAString* aOutString,
                                        PRUint32 aFlags,
                                        const nsAString& allowedTags)
{
  nsresult rv = Init(aFlags, 0, nsnull, PR_FALSE);
  NS_ENSURE_SUCCESS(rv, rv);

  // XXX This is wrong. It violates XPCOM string ownership rules.
  // We're only getting away with this because instances of this
  // class are restricted to single function scope.
  // (Comment copied from nsPlaintextSerializer)
  mOutputString = aOutString;

  ParsePrefs(allowedTags);

  return NS_OK;
}

// This is not used within the class, but maybe called from somewhere else?
NS_IMETHODIMP
mozSanitizingHTMLSerializer::Flush(nsAString& aStr)
{
#if DEBUG_BenB
  printf("Flush: -%s-", NS_LossyConvertUCS2toASCII(aString).get());
#endif
  Write(aStr);
  return NS_OK;
}

void
mozSanitizingHTMLSerializer::Write(const nsAString& aString)
{
  //  printf("%s", NS_LossyConvertUCS2toASCII(aString).get());
  mOutputString->Append(aString);
}


NS_IMETHODIMP
mozSanitizingHTMLSerializer::GetPref(PRInt32 aTag, PRBool& aPref)
{
  aPref = PR_FALSE;

  return NS_OK;
}

nsresult
mozSanitizingHTMLSerializer::GetParserService(
                                          nsIParserService** aParserService)
{
  if (!mParserService) {
    nsresult rv;
    mParserService = do_GetService(kParserServiceCID, &rv);
    if (NS_FAILED(rv))
      return rv;
  }

  *aParserService = mParserService;
  NS_ADDREF(*aParserService);
  return NS_OK;
}


/**
 * Returns true, if the id represents a container
 */
PRBool
mozSanitizingHTMLSerializer::IsContainer(PRInt32 aId)
{
  PRBool isContainer = PR_FALSE;

  nsCOMPtr<nsIParserService> parserService;
  GetParserService(getter_AddRefs(parserService));
  if (parserService) {
    parserService->IsContainer(aId, isContainer);
  }

  return isContainer;
}


/* XXX I don't really know, what these functions do, but they seem to be
   needed ;-). Mostly copied from nsPlaintextSerializer. */
/* akk says:
   "I wonder if the sanitizing class could inherit from nsHTMLSerializer,
   so that at least these methods that none of us understand only have to be
   written once?" */

nsresult
mozSanitizingHTMLSerializer::GetIdForContent(nsIContent* aContent,
                                             PRInt32* aID)
{
  nsCOMPtr<nsIHTMLContent> htmlcontent = do_QueryInterface(aContent);
  if (!htmlcontent) {
    *aID = eHTMLTag_unknown;
    return NS_OK;
  }

  nsCOMPtr<nsIAtom> tagname;
  mContent->GetTag(*getter_AddRefs(tagname));
  if (!tagname)
    return NS_ERROR_FAILURE;

  nsresult rv;
  nsCOMPtr<nsIParserService> parserService;
  rv = GetParserService(getter_AddRefs(parserService));
  if (NS_FAILED(rv))
    return rv;

  rv = parserService->HTMLAtomTagToId(tagname, aID);
  if (NS_FAILED(rv))
    return rv;

  return NS_OK;
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::AppendText(nsIDOMText* aText,
                                        PRInt32 aStartOffset,
                                        PRInt32 aEndOffset,
                                        nsAString& aStr)
{
  nsresult rv = NS_OK;

  mOutputString = &aStr;

  nsAutoString linebuffer;
  rv = DoAddLeaf(eHTMLTag_text, linebuffer);

  return rv;
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::AppendElementStart(nsIDOMElement *aElement,
                                                PRBool aHasChildren,
                                                nsAString& aStr)
{
  NS_ENSURE_ARG(aElement);

  mContent = do_QueryInterface(aElement);
  if (!mContent) return NS_ERROR_FAILURE;

  mOutputString = &aStr;

  nsresult rv;
  PRInt32 id;
  rv = GetIdForContent(mContent, &id);
  if (NS_FAILED(rv)) return rv;

  PRBool isContainer = IsContainer(id);

  if (isContainer) {
    rv = DoOpenContainer(id);
  }
  else {
    nsAutoString empty;
    rv = DoAddLeaf(id, empty);
  }

  mContent = 0;
  mOutputString = nsnull;

  return rv;
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::AppendElementEnd(nsIDOMElement *aElement,
                                              nsAString& aStr)
{
  NS_ENSURE_ARG(aElement);

  mContent = do_QueryInterface(aElement);
  if (!mContent) return NS_ERROR_FAILURE;

  mOutputString = &aStr;

  nsresult rv;
  PRInt32 id;
  rv = GetIdForContent(mContent, &id);
  if (NS_FAILED(rv)) return rv;

  PRBool isContainer = IsContainer(id);

  if (isContainer) {
    rv = DoCloseContainer(id);
  }

  mContent = 0;
  mOutputString = nsnull;

  return rv;
}


NS_IMETHODIMP
mozSanitizingHTMLSerializer::OpenContainer(const nsIParserNode& aNode)
{
  PRInt32 type = aNode.GetNodeType();

  mParserNode = NS_CONST_CAST(nsIParserNode *, &aNode);
  return DoOpenContainer(type);
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::CloseContainer(const nsIParserNode& aNode)
{
  PRInt32 type = aNode.GetNodeType();
  const nsAString& namestr = aNode.GetText();
  nsCOMPtr<nsIAtom> name = dont_AddRef(NS_NewAtom(namestr));

  mParserNode = NS_CONST_CAST(nsIParserNode*, &aNode);
  return DoCloseContainer(type);
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::AddLeaf(const nsIParserNode& aNode)
{
  eHTMLTags type = (eHTMLTags)aNode.GetNodeType();
  const nsAString& text = aNode.GetText();

  mParserNode = NS_CONST_CAST(nsIParserNode*, &aNode);
  return DoAddLeaf(type, text);
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::OpenHTML(const nsIParserNode& aNode)
{
  return OpenContainer(aNode);
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::CloseHTML(const nsIParserNode& aNode)
{
  return CloseContainer(aNode);
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::SetTitle(const nsString& aValue)
{
  if (IsAllowedTag(eHTMLTag_title))
  {
    //Write(NS_LITERAL_STRING("<title>"));  -- aValue already contains that
    Write(aValue);
    //Write(NS_LITERAL_STRING("</title>"));
  }
  return NS_OK;
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::AddDocTypeDecl(const nsIParserNode& aNode)
{
  return NS_OK;
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::SetDocumentCharset(nsAString& aCharset)
{
  // No idea, if this works - it isn't invoked by |TestOutput|.
  Write(NS_LITERAL_STRING("\n<meta http-equiv=\"Context-Type\" content=\"text/html; charset=")
        /* Danger: breaking the line within the string literal, like
           "foo"\n"bar", breaks win32! */
        + aCharset + NS_LITERAL_STRING("\">\n"));
  return NS_OK;
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::OpenHead(const nsIParserNode& aNode)
{
  return OpenContainer(aNode);
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::CloseHead(const nsIParserNode& aNode)
{
  return CloseContainer(aNode);
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::OpenBody(const nsIParserNode& aNode)
{
  return OpenContainer(aNode);
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::CloseBody(const nsIParserNode& aNode)
{
  return CloseContainer(aNode);
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::OpenForm(const nsIParserNode& aNode)
{
  return OpenContainer(aNode);
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::CloseForm(const nsIParserNode& aNode)
{
  return CloseContainer(aNode);
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::OpenMap(const nsIParserNode& aNode)
{
  return OpenContainer(aNode);
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::CloseMap(const nsIParserNode& aNode)
{
  return CloseContainer(aNode);
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::OpenFrameset(const nsIParserNode& aNode)
{
  return OpenContainer(aNode);
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::CloseFrameset(const nsIParserNode& aNode)
{
  return CloseContainer(aNode);
}

NS_IMETHODIMP
mozSanitizingHTMLSerializer::DoFragment(PRBool aFlag)
{
  return NS_OK;
}


// Here comes the actual code...

nsresult
mozSanitizingHTMLSerializer::DoOpenContainer(PRInt32 aTag)
{
  eHTMLTags type = (eHTMLTags)aTag;

  if (IsAllowedTag(type))
  {
    nsCOMPtr<nsIParserService> parserService;
    nsresult rv = GetParserService(getter_AddRefs(parserService));
    if (NS_FAILED(rv))
      return rv;
    const PRUnichar* tag_name;
    parserService->HTMLIdToStringTag(aTag, &tag_name);
    NS_ENSURE_TRUE(tag_name, NS_ERROR_INVALID_POINTER);

    Write(NS_LITERAL_STRING("<") + nsDependentString(tag_name));

    // Attributes
    if (mParserNode)
    {
      PRInt32 count = mParserNode->GetAttributeCount();
      for (PRInt32 i = 0; i < count; i++)
      {
        const nsAString& key = mParserNode->GetKeyAt(i);
        if(IsAllowedAttribute(type, key))
        {
          // Ensure basic sanity of value
          nsAutoString value(mParserNode->GetValueAt(i));
                    // SanitizeAttrValue() modifies |value|
          if (NS_SUCCEEDED(SanitizeAttrValue(type, key, value)))
          {
            // Write out
            Write(NS_LITERAL_STRING(" "));
            Write(key); // I get an infinive loop with | + key + | !!!
            Write(NS_LITERAL_STRING("=\"") + value + NS_LITERAL_STRING("\""));
          }
        }
      }
    }

    Write(NS_LITERAL_STRING(">"));
  }
  else
    Write(NS_LITERAL_STRING(" "));

  return NS_OK;

}

nsresult
mozSanitizingHTMLSerializer::DoCloseContainer(PRInt32 aTag)
{
  eHTMLTags type = (eHTMLTags)aTag;

  if (IsAllowedTag(type)) {
    nsCOMPtr<nsIParserService> parserService;
    nsresult rv = GetParserService(getter_AddRefs(parserService));
    if (NS_FAILED(rv))
      return rv;
    const PRUnichar* tag_name;
    parserService->HTMLIdToStringTag(aTag, &tag_name);
    NS_ENSURE_TRUE(tag_name, NS_ERROR_INVALID_POINTER);

    Write(NS_LITERAL_STRING("</") + nsDependentString(tag_name)
          + NS_LITERAL_STRING(">"));
  }
  else
    Write(NS_LITERAL_STRING(" "));

  return NS_OK;
}

nsresult
mozSanitizingHTMLSerializer::DoAddLeaf(PRInt32 aTag,
                                       const nsAString& aText)
{
  eHTMLTags type = (eHTMLTags)aTag;

  nsresult rv = NS_OK;

  if (type == eHTMLTag_whitespace ||
      type == eHTMLTag_newline)
  {
    Write(aText);
  }
  else if (type == eHTMLTag_text)
  {
    nsAutoString text(aText);
    if(NS_SUCCEEDED(SanitizeTextNode(text)))
      Write(text);
    else
      Write(NS_LITERAL_STRING(TEXT_REMOVED)); // Does not happen (yet)
    if (NS_FAILED(rv))
      return rv;
  }
  else if (type == eHTMLTag_entity)
  {
    Write(NS_LITERAL_STRING("&"));
    Write(aText);
    // using + operator here might give an infinitive loop, see above.
  }
  else
  {
    DoOpenContainer(type);
  }

  return rv;
}


/**
   Similar to SanitizeAttrValue.
 */
nsresult
mozSanitizingHTMLSerializer::SanitizeTextNode(nsString& aText /*inout*/)
{
  nsString& text = aText; // alias

  PRInt32 pos;
  if ((pos = text.Find("base64")) != kNotFound)
    // Probably useless, not sure, but perfhit
    //    return NS_ERROR_ILLEGAL_VALUE; -- this is too extreme
    text.Insert(NS_LITERAL_STRING(TEXT_BREAKER), pos + 1);
         /* Insert some other text after the first char of the problematic
            text, so we prevent the processing by Gecko.
            No idea, if that is needed, but better do it than being sorry.
            Somebody who knows all the dangers and how they are reflected
            in Gecko please jump in. */

  return NS_OK;
}

/**
   Ensures basic sanity of attribute value.
   This function also (tries to :-( ) makes sure, that no
   unwanted / dangerous URLs appear in the document
   (like javascript: and data:).

   Pass the value as |value| arg. It will be modified in-place.

   If the value is not allowed at all, we return with NS_ERROR_ILLEGAL_VALUE.
   In that case, do not use the |value|, but output nothing.
 */
nsresult
mozSanitizingHTMLSerializer::SanitizeAttrValue(nsHTMLTag aTag,
                                               const nsAString& attr_name,
                                               nsString& aValue /*inout*/)
{
  /* First, cut the attribute to 1000 chars.
     Attributes with values longer than 1000 chars seem bogus,
     considering that we don't support any JS. The longest attributes
     I can think of are URLs, and URLs with 1000 chars are likely to be
     bogus, too. */
  nsAutoString value(Substring(aValue, 0, 1000));
  //value.Truncate(1000); //-- this cuts half of the document !!?!!

#ifdef DEBUG_BenB
          printf("7: %s\n", ToNewUTF8String(value));
#endif

  value.StripChars("\"'");  /* This will break javascript attributes,
                               but who wants javascript in
                               sanitized HTML? */

  /* Check some known bad stuff. Add more!
     I don't care too much, if it happens to trigger in some innocent cases
     (like <img alt="Statistical data: Mortage rates and newspapers">) -
     security first. */
  if (value.Find("javascript:") != kNotFound ||
      value.Find("data:") != kNotFound ||
      value.Find("base64") != kNotFound)
    return NS_ERROR_ILLEGAL_VALUE;

  return NS_OK;
}

/**
 */
PRBool
mozSanitizingHTMLSerializer::IsAllowedTag(nsHTMLTag aTag)
{

  nsPRUint32Key tag_key(aTag);
#ifdef DEBUG_BenB
  printf("IsAllowedTag %d: %s\n",
         aTag,
         mAllowedTags.Exists(&tag_key)?"yes":"no");
#endif
  return mAllowedTags.Exists(&tag_key);
}


/**
 */
PRBool
mozSanitizingHTMLSerializer::IsAllowedAttribute(nsHTMLTag aTag,
                                             const nsAString& anAttributeName)
{
#ifdef DEBUG_BenB
  printf("IsAllowedAttribute %d, -%s-\n",
         aTag,
         NS_LossyConvertUCS2toASCII(anAttributeName).get());
#endif
  nsresult rv;

  nsPRUint32Key tag_key(aTag);
  nsIProperties* attr_bag = (nsIProperties*)mAllowedTags.Get(&tag_key);
  NS_ENSURE_TRUE(attr_bag, PR_FALSE);

  PRBool allowed;
  nsAutoString attr(anAttributeName);
  ToLowerCase(attr);
  rv = attr_bag->Has(NS_LossyConvertUCS2toASCII(attr).get(),
                     &allowed);
  if (NS_FAILED(rv))
    return PR_FALSE;

#ifdef DEBUG_BenB
  printf(" Allowed: %s\n", allowed?"yes":"no");
#endif
  return allowed;
}


/**
   aPref is a long string, which holds an exhaustive list of allowed tags
   and attributes. All other tags and attributes will be removed.

   aPref has the format
   "html head body ul ol li a(href,name,title) img(src,alt,title) #text"
   i.e.
   - tags are separated by whitespace
   - the attribute list follows the tag directly in brackets
   - the attributes are separated by commas.

   There is no way to express further restrictions, like "no text inside the
   <head> element. This is so to considerably reduce the complexity of the
   pref and this implementation.

   Update: Akk told me that I might be able to use DTD classes. Later(TM)...
 */
nsresult
mozSanitizingHTMLSerializer::ParsePrefs(const nsAString& aPref)
{
  nsCOMPtr<nsIParserService> parserService;
  nsresult rv = GetParserService(getter_AddRefs(parserService));
  if (NS_FAILED(rv))
    return rv;

  char* pref = ToNewCString(aPref);
#ifdef DEBUG_BenB
  printf("pref: -%s-\n", pref);
#endif
  char* tags_lasts;
  for (char* iTag = PL_strtok_r(pref, " ", &tags_lasts);
       iTag;
       iTag = PL_strtok_r(NULL, " ", &tags_lasts))
  {
    ParseTagPref(nsCAutoString(iTag));
  }
  delete[] pref;

  return NS_OK;
}


/**
   Parses e.g. "a(href,title)" (but not several tags at once).
 */
nsresult
mozSanitizingHTMLSerializer::ParseTagPref(const nsCAutoString& tagpref)
{
  nsCOMPtr<nsIParserService> parserService;
  nsresult rv = GetParserService(getter_AddRefs(parserService));
  if (NS_FAILED(rv))
    return rv;

  // Parsing tag
#ifdef DEBUG_BenB
  printf("Processing tag pref -%s-\n", tagpref.get());
#endif
  PRInt32 bracket = tagpref.Find("(");
  nsCAutoString tag = tagpref;
  if (bracket != kNotFound)
    tag.Truncate(bracket);
#ifdef DEBUG_BenB
  printf( "Tag -%s-\n", tag.get());
#endif
  if (tag.Equals(""))
  {
    printf(" malformed pref: %s\n", tagpref.get());
    return NS_ERROR_CANNOT_CONVERT_DATA;
  }

  // Create key
  NS_ConvertASCIItoUCS2 tag_widestr(tag);
  PRInt32 tag_id;
  parserService->HTMLStringTagToId(tag_widestr, &tag_id);
#ifdef DEBUG_BenB
  printf(" Have tag %d\n", tag_id);
  const PRUnichar* tag_back;
  parserService->HTMLIdToStringTag(tag_id, &tag_back);
  printf(" Equals -%s-\n", tag_back
                           ? NS_ConvertUCS2toUTF8(tag_back).get()
                           : "");
#endif
  if (tag_id == eHTMLTag_userdefined ||
      tag_id == eHTMLTag_unknown)
  {
    printf(" unknown tag <%s>, won't add.\n", tag.get());
    return NS_ERROR_CANNOT_CONVERT_DATA;
  }
  nsPRUint32Key tag_key(tag_id);

  if (mAllowedTags.Exists(&tag_key))
  {
    printf(" duplicate tag: %s\n", tag.get());
    return NS_ERROR_CANNOT_CONVERT_DATA;
  }
  if (bracket == kNotFound)
    /* There are no attributes in the pref. So, allow none; only the tag
       itself */
  {
    mAllowedTags.Put(&tag_key, 0);
  }
  else
  {
    // Attributes

    // where is the macro for non-fatal errors in opt builds?
    if(tagpref[tagpref.Length() - 1] != ')' ||
       tagpref.Length() < PRUint32(bracket) + 3)
    {
      printf(" malformed pref: %s\n", tagpref.get());
      return NS_ERROR_CANNOT_CONVERT_DATA;
    }
    nsCOMPtr<nsIProperties> attr_bag =
                                 do_CreateInstance(NS_PROPERTIES_CONTRACTID);
    NS_ENSURE_TRUE(attr_bag, NS_ERROR_INVALID_POINTER);
    nsCAutoString attrList;
    attrList.Append(Substring(tagpref,
                              bracket + 1,
                              tagpref.Length() - 2 - bracket));
#ifdef DEBUG_BenB
    printf(" Attr list: -%s-\n", attrList.get());
#endif
    char* attrs_lasts;
    for (char* iAttr = PL_strtok_r(NS_CONST_CAST(char*, attrList.get()),
                                   ",", &attrs_lasts);
         iAttr;
         iAttr = PL_strtok_r(NULL, ",", &attrs_lasts))
    {
#ifdef DEBUG_BenB
      printf(" Processing attr -%s-\n", iAttr);
#endif
      attr_bag->Define(iAttr, 0);
    }

    nsIProperties* attr_bag_raw = attr_bag;
    NS_ADDREF(attr_bag_raw);
    mAllowedTags.Put(&tag_key, attr_bag_raw);
  }

  return NS_OK;
}

/*
  might be useful:
  htmlparser/public/nsHTMLTokens.h for tag categories
*/