How to Strip HTML Tags but Keep their Text

Let’s consider the following problem. We have some HTML which comes from user input, so the HTML itself cannot be trusted. We want to remove certain elements from this HTML, but keep the underlying text content so that it is safe to display on our own site. This also means we need to strip any unwanted attributes from certain tags.

This problem isn’t as simple as it appears at first, because in order to remove specific HTML elements from the DOM tree but keep the text inside them, we would have to recursively search the DOM tree, find the unwanted nodes, retain their text nodes, remove them from the tree, and insert the text nodes back into the removed node’s parent.

Walking the DOM Tree (Depth-First Pre-Order Recursion)

To do this in a more straightforward fashion, I’m simply going to extend DOMDocument with a generator method that walks the tree using depth-first, pre-order traversal (each node is visited before its children).

<?php
class HTMLFixer extends DOMDocument {
    /**
     * Lazily iterate over a node and all of its descendants, depth first.
     *
     * Every node is yielded before its own children (pre-order), and the
     * children are visited in the order they appear in childNodes.
     *
     * @param DOMNode $node       Node at which the traversal starts.
     * @param bool    $skipParent When truthy, $node itself is not yielded;
     *                            only its descendants are.
     * @return Generator Yields DOMNode instances.
     */
    public function walk(DOMNode $node, $skipParent = false) {
        if (!$skipParent) {
            yield $node;
        }

        // Leaf node: nothing further to descend into.
        if (!$node->hasChildNodes()) {
            return;
        }

        foreach ($node->childNodes as $child) {
            // Recursive calls never skip the child itself.
            yield from $this->walk($child);
        }
    }
}

Next we need a way to find the unwanted nodes and remove them without discarding their text. The simplest way to do this is to build a list of the nodes to be removed while we walk the tree, and only afterwards use DOMNode::insertBefore() to move their child nodes up into the parent, just in front of the node being removed. That way we never modify the tree while we are still iterating over it, and the unwanted nodes can be removed later without any side effects.

To do this let’s add a general stripTags() method to our extended HTMLFixer class above.

<?php
/**
 * Unwrap every disallowed node beneath (and including) $node.
 *
 * Allowed and disallowed elements alike have their attributes filtered via
 * stripAttributes() and forceAttributes(). A node whose nodeName is not in
 * $this->allowedTags is removed, but its children are first moved up to take
 * its place, so text content is preserved.
 *
 * All tree modifications are deferred until the walk has finished, so the
 * tree is never mutated while it is being iterated.
 *
 * @param DOMNode $node Root of the subtree to sanitise.
 * @return void
 */
public function stripTags(DOMNode $node) {
    $change = $remove = [];

    /* Walk the entire tree to build a list of things that need to be removed */
    foreach ($this->walk($node) as $n) {
        if ($n instanceof DOMText || $n instanceof DOMDocument) {
            continue;
        }
        // Only elements carry attributes; comments, doctypes and processing
        // instructions have a null attribute map and no setAttribute().
        if ($n instanceof DOMElement) {
            $this->stripAttributes($n); // strips all node attributes not allowed
            $this->forceAttributes($n); // forces any required attributes
        }
        if (in_array($n->nodeName, $this->allowedTags, true)) {
            continue;
        }
        // A detached root has no parent to receive its children, so it
        // cannot be unwrapped; leave it in place instead of crashing.
        if ($n->parentNode === null) {
            continue;
        }
        // track the disallowed node for removal
        $remove[] = $n;
        // we take all of its child nodes for modification later
        if ($n->hasChildNodes()) {
            foreach ($n->childNodes as $child) {
                $change[] = [$child, $n];
            }
        }
    }

    /* Go through the list of changes first so we don't break the
       referential integrity of the tree */
    foreach ($change as list($child, $wrapper)) {
        if ($wrapper->parentNode !== null) {
            $wrapper->parentNode->insertBefore($child, $wrapper);
        }
    }

    /* Now we can safely remove the old nodes */
    foreach ($remove as $old) {
        if ($old->parentNode) {
            $old->parentNode->removeChild($old);
        }
    }
}

Deferring the move of each node makes sure we don’t break a parentNode reference part-way through the walk — for example, when a deeper node is allowed but its parent is not in the allowed tags list.

Here’s the complete solution.

<?php
/**
 * A DOMDocument that sanitises its tree when it is serialised with saveHTML().
 *
 * Elements whose tag name is not in the allow-list are unwrapped (removed,
 * keeping their children), attributes not in the per-tag allow-list are
 * stripped, URL-bearing attributes with an unsafe scheme are stripped, and
 * configured attributes are forced onto matching elements.
 */
class HTMLFixer extends DOMDocument {

    /** Tag names kept when no custom list is passed to the constructor. */
    protected static $defaultAllowedTags = [
        'p',
        'h1',
        'h2',
        'h3',
        'h4',
        'h5',
        'h6',
        'pre',
        'code',
        'blockquote',
        'q',
        'strong',
        'em',
        'del',
        'img',
        'a',
        'table',
        'thead',
        'tbody',
        'tfoot',
        'tr',
        'th',
        'td',
        'ul',
        'ol',
        'li',
    ];

    /** Default map of tag name => attribute names that may be kept. */
    protected static $defaultAllowedAttributes = [
        'a'   => ['href'],
        'img' => ['src'],
        'pre' => ['class'],
    ];

    /**
     * Default map of tag name => [attribute => value] that is always set.
     * rel="noopener noreferrer" accompanies target="_blank" so the opened
     * page is not handed a window.opener reference to this one.
     */
    protected static $defaultForceAttributes = [
        'a' => ['target' => '_blank', 'rel' => 'noopener noreferrer'],
    ];

    /** Attribute names whose value is a URL and must pass isSafeUrl(). */
    protected static $urlAttributes = ['href', 'src'];

    /** URL schemes accepted in URL attributes; scheme-less URLs are accepted too. */
    protected static $safeUrlSchemes = ['http', 'https', 'mailto', 'tel', 'ftp'];

    protected $allowedTags       = [];
    protected $allowedAttributes = [];
    protected $forceAttributes   = [];

    /**
     * @param string|null $version           XML version; null selects "1.0".
     * @param string|null $encoding          Document encoding; null selects none.
     * @param array       $allowedTags       Tag allow-list. An empty array
     *                                       selects the built-in defaults.
     * @param array       $allowedAttributes Map of tag => allowed attribute
     *                                       names. Empty selects the defaults.
     * @param array       $forceAttributes   Map of tag => [attribute => value].
     *                                       Empty selects the defaults.
     */
    public function __construct($version = null, $encoding = null, $allowedTags = [],
                                $allowedAttributes = [], $forceAttributes = []) {
        $this->setAllowedTags($allowedTags ?: static::$defaultAllowedTags);
        $this->setAllowedAttributes($allowedAttributes ?: static::$defaultAllowedAttributes);
        $this->setForceAttributes($forceAttributes ?: static::$defaultForceAttributes);
        // DOMDocument's own parameters are non-nullable strings, so map null
        // onto the parent's documented defaults instead of forwarding it.
        parent::__construct($version ?? '1.0', $encoding ?? '');
    }

    /** @param array $tags List of tag names that may remain in the output. */
    public function setAllowedTags(array $tags) {
        $this->allowedTags = $tags;
    }

    /** @param array $attributes Map of tag name => list of allowed attribute names. */
    public function setAllowedAttributes(array $attributes) {
        $this->allowedAttributes = $attributes;
    }

    /** @param array $attributes Map of tag name => [attribute => value] to force. */
    public function setForceAttributes(array $attributes) {
        $this->forceAttributes = $attributes;
    }

    /** @return array The current tag allow-list. */
    public function getAllowedTags() {
        return $this->allowedTags;
    }

    /** @return array The current map of tag name => allowed attribute names. */
    public function getAllowedAttributes() {
        return $this->allowedAttributes;
    }

    /** @return array The current map of tag name => forced attributes. */
    public function getForceAttributes() {
        return $this->forceAttributes;
    }

    /**
     * Sanitise the tree under $node (the whole document when omitted) and
     * return its HTML serialisation.
     *
     * The sanitising pass modifies the document in place.
     *
     * NOTE(review): when $node is itself a disallowed element that is attached
     * to the tree, stripTags() moves its children to its parent and detaches
     * it before it is serialised here, so the returned markup is that of the
     * emptied element. Pass an allowed element or the document to avoid this.
     *
     * @param DOMNode|null $node Node to serialise, or null for the document.
     * @return string|false Whatever DOMDocument::saveHTML() returns.
     */
    #[\ReturnTypeWillChange]
    public function saveHTML(DOMNode $node = null) {
        if (!$node) {
            $node = $this;
        }
        $this->stripTags($node);
        return parent::saveHTML($node);
    }

    /**
     * Filter attributes on, and unwrap disallowed nodes in, the subtree at $node.
     *
     * Nodes are only collected during the walk; the tree is modified after
     * the walk has finished so the live child lists being iterated are not
     * changed underneath the generator.
     *
     * @param DOMNode $node Root of the subtree to sanitise.
     * @return void
     */
    protected function stripTags(DOMNode $node) {
        $change = $remove = [];
        foreach ($this->walk($node) as $n) {
            if ($n instanceof DOMText || $n instanceof DOMDocument) {
                continue;
            }
            // Only elements carry attributes; comments, doctypes and
            // processing instructions are left to the tag check below.
            if ($n instanceof DOMElement) {
                $this->stripAttributes($n);
                $this->forceAttributes($n);
            }
            if (in_array($n->nodeName, $this->allowedTags, true)) {
                continue;
            }
            // A detached root has no parent to receive its children, so it
            // cannot be unwrapped; leave it in place instead of crashing.
            if ($n->parentNode === null) {
                continue;
            }
            $remove[] = $n;
            if ($n->hasChildNodes()) {
                foreach ($n->childNodes as $child) {
                    $change[] = [$child, $n];
                }
            }
        }
        // Move children up first, in walk order, so each wrapper still has a
        // parent when its children are re-inserted in front of it.
        foreach ($change as list($child, $wrapper)) {
            if ($wrapper->parentNode !== null) {
                $wrapper->parentNode->insertBefore($child, $wrapper);
            }
        }
        // The wrappers are empty by now and can be dropped.
        foreach ($remove as $old) {
            if ($old->parentNode) {
                $old->parentNode->removeChild($old);
            }
        }
    }

    /**
     * Remove every attribute of $node that is not allowed for its tag name,
     * plus any URL attribute whose value fails isSafeUrl().
     *
     * Nodes without attributes (including every non-element node) are left
     * untouched.
     *
     * @param DOMNode $node Node whose attributes are filtered in place.
     * @return void
     */
    protected function stripAttributes(DOMNode $node) {
        if (!$node->hasAttributes()) {
            return;
        }
        $allowed = isset($this->allowedAttributes[$node->nodeName])
            ? $this->allowedAttributes[$node->nodeName]
            : [];
        $attributes = $node->attributes;
        // Iterate backwards: removing an attribute shifts later indexes down.
        for ($i = $attributes->length - 1; $i >= 0; $i--) {
            $attr = $attributes->item($i);
            $keep = in_array($attr->name, $allowed, true);
            if ($keep
                && in_array(strtolower($attr->name), static::$urlAttributes, true)
                && !$this->isSafeUrl($attr->value)) {
                // e.g. href="javascript:..." on an otherwise allowed attribute
                $keep = false;
            }
            if (!$keep) {
                $node->removeAttributeNode($attr);
            }
        }
    }

    /**
     * Set every forced attribute configured for $node's tag name.
     *
     * Non-element nodes are ignored because they cannot carry attributes.
     *
     * @param DOMNode $node Node to update in place.
     * @return void
     */
    protected function forceAttributes(DOMNode $node) {
        if (!$node instanceof DOMElement) {
            return;
        }
        if (isset($this->forceAttributes[$node->nodeName])) {
            foreach ($this->forceAttributes[$node->nodeName] as $attribute => $value) {
                $node->setAttribute($attribute, $value);
            }
        }
    }

    /**
     * Decide whether a URL attribute value may be kept.
     *
     * A value with no scheme (a relative or protocol-relative URL) is
     * accepted; a value with a scheme is accepted only when that scheme is
     * listed in $safeUrlSchemes.
     *
     * @param string $url Raw attribute value.
     * @return bool True when the value may be kept.
     */
    protected function isSafeUrl($url) {
        // Drop ASCII whitespace and control characters before looking for the
        // scheme, so that "java\tscript:" cannot slip past the check.
        $compact = preg_replace('/[\x00-\x20\x7F]+/', '', (string) $url);
        if (!preg_match('/^([a-z][a-z0-9+.\-]*):/i', (string) $compact, $match)) {
            return true;
        }
        return in_array(strtolower($match[1]), static::$safeUrlSchemes, true);
    }

    /**
     * Lazily iterate over $node and its descendants, depth first, yielding
     * each node before its children.
     *
     * @param DOMNode $node       Node at which the traversal starts.
     * @param bool    $skipParent When truthy, $node itself is not yielded.
     * @return Generator Yields DOMNode instances.
     */
    protected function walk(DOMNode $node, $skipParent = false) {
        if (!$skipParent) {
            yield $node;
        }
        if ($node->hasChildNodes()) {
            foreach ($node->childNodes as $n) {
                yield from $this->walk($n);
            }
        }
    }

}