Drupal 7 migrations from HTML pages

By Joel Stein on February 10, 2014

I often use the excellent Migrate suite of modules to import data into Drupal. It’s great for getting any type of content into a Drupal node, file, taxonomy term, user, and more. My latest task: importing content from raw HTML into a Drupal node. Not finding any pre-built classes in Migrate to handle this, I built a few simple classes myself.

I’m assuming you already know how to work with Migrate. First, add these classes to your codebase:

/**
 * Lists the source IDs for an HTML migration.
 *
 * Pass an array of URLs to the constructor. The URLs themselves are returned
 * as the ID list, so each URL doubles as the source ID of its page.
 */
class MigrateListHtml extends MigrateList {

  /**
   * The URLs handed to the constructor.
   *
   * @var array
   */
  protected $urls;

  /**
   * Stores the list of URLs to migrate.
   *
   * @param array $urls
   *   URLs of the HTML pages to import.
   */
  public function __construct($urls) {
    parent::__construct();
    $this->urls = $urls;
  }

  /**
   * Describes the list: how many URLs it holds, plus one sample URL.
   *
   * @return string
   *   An empty string when there are no URLs, otherwise the translated
   *   summary.
   */
  public function __toString() {
    if (empty($this->urls)) {
      return '';
    }
    // Fetch the first element with reset() instead of index 0: the array may
    // not be zero-indexed (for example after array_filter() or
    // array_unique()), in which case [0] would raise an undefined-offset
    // notice. Working on a copy leaves the stored array's internal pointer
    // untouched.
    $urls = $this->urls;
    return t('!count URLs (such as !first_url)', array(
      '!count' => count($urls),
      '!first_url' => reset($urls),
    ));
  }

  /**
   * Returns the source IDs, i.e. the URLs exactly as they were passed in.
   *
   * @return array
   *   The list of URLs.
   */
  public function getIdList() {
    return $this->urls;
  }

  /**
   * Returns the number of URLs in the list.
   *
   * @return int
   *   The URL count.
   */
  public function computeCount() {
    return count($this->urls);
  }

}

/**
 * Processes each URL.
 *
 * Downloads the page behind a URL and exposes it as raw HTML, a DOMDocument
 * and a DOMXPath, so that the migration can pick out the parts it needs.
 */
class MigrateItemHtml extends MigrateItem {

  /**
   * Fetches and parses a single page.
   *
   * @param string $url
   *   The URL to fetch; it is also the source ID of the item.
   *
   * @return object|null
   *   An object with "url", "html", "dom" and "xpath" properties, or NULL
   *   when the page could not be fetched, was empty, or could not be parsed.
   *   NOTE(review): returning NULL on failure mirrors what Migrate's bundled
   *   item classes do - confirm against the Migrate version in use.
   */
  public function getItem($url) {
    // Get HTML. The PHP warning is suppressed because the failure is checked
    // and reported explicitly right below.
    $html = @file_get_contents($url);
    if ($html === FALSE || $html === '') {
      // Without this guard, FALSE or '' would be handed to loadHTML(), which
      // warns on older PHP versions and throws a ValueError on PHP 8.
      $this->recordFailure($url, t('Loading of !url failed or returned no content.', array('!url' => $url)));
      return NULL;
    }

    // Load HTML. Real-world markup is rarely valid, so collect libxml's
    // complaints internally instead of silencing the whole call with "@",
    // then restore the previous libxml setting.
    $previous = libxml_use_internal_errors(TRUE);
    $dom = new DOMDocument();
    $loaded = $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if ($loaded === FALSE) {
      $this->recordFailure($url, t('Parsing of !url failed.', array('!url' => $url)));
      return NULL;
    }

    // Bundle everything prepareRow() may want to look at.
    $item = new stdClass();
    $item->url = $url;
    $item->html = $html;
    $item->dom = $dom;
    $item->xpath = new DOMXPath($dom);

    return $item;
  }

  /**
   * Saves an error message against the source ID on the migration's map.
   *
   * Does nothing when no migration is currently running.
   *
   * @param string $url
   *   The source ID (URL) the message belongs to.
   * @param string $message
   *   The already translated message.
   */
  protected function recordFailure($url, $message) {
    $migration = Migration::currentMigration();
    if ($migration) {
      $migration->getMap()->saveMessage(array($url), $message, MigrationBase::MESSAGE_ERROR);
    }
  }

}

Then, define your migration class. You’ll need to tell MigrateSourceList which fields your source makes available (in my example, that’s “url”, “title” and “body”); “title” and “body” are filled in by your prepareRow() method. Inside prepareRow(), each row carries a DOMXPath object that you can use to query the page’s HTML.

Here’s what it might look like:

/**
 * Example migration: imports a fixed list of HTML pages as Page nodes.
 *
 * The title is taken from the first H1 of each page. The body is taken from
 * a specific DIV and falls back to the whole BODY element.
 */
class MyHtmlMigration extends Migration {

  /**
   * Sets up the source, destination, map and field mappings.
   *
   * @param array $arguments
   *   Migration arguments, passed through to the parent constructor.
   */
  public function __construct(array $arguments) {
    parent::__construct($arguments);

    // Define source. Pass array of URLs to MigrateListHtml, and
    // array of fields to MigrateSourceList.
    $this->source = new MigrateSourceList(new MigrateListHtml(array(
      'http://mywebsite.com/page1.html',
      'http://mywebsite.com/page2.html',
      'http://mywebsite.com/page3.html',
    )), new MigrateItemHtml, array(
      'url' => t('URL'),
      'title' => t('Title'),
      'body' => t('Body'),
    ));

    // Define destination. I'm creating Page nodes.
    $this->destination = new MigrateDestinationNode('page');

    // Define mapping. Maps source URLs to destination node IDs.
    $this->map = new MigrateSQLMap($this->machineName, array(
      'sourceid' => array(
        'type' => 'varchar',
        'length' => 255,
        'not null' => TRUE,
      ),
    ), MigrateDestinationNode::getKeySchema());

    // Field mappings.
    $this->addFieldMapping('title', 'title');
    $this->addFieldMapping('body', 'body');
    $this->addFieldMapping('body:format')->defaultValue('full_html');
  }

  /**
   * Fills in the "title" and "body" source fields for one row.
   *
   * $row has "url", "html", "dom", and "xpath" variables.
   * Manually parse $row->html, or use $row->xpath to query HTML.
   *
   * @param object $row
   *   The row produced by MigrateItemHtml::getItem().
   *
   * @return bool
   *   FALSE when the parent implementation rejects the row, TRUE otherwise.
   */
  public function prepareRow($row) {
    if (parent::prepareRow($row) === FALSE) {
      return FALSE;
    }

    // Get title from first H1 tag.
    $headings = $row->xpath->query('//h1');
    $row->title = $headings->length > 0 ? $headings->item(0)->nodeValue : '';

    // Get content for body: prefer the outer HTML of the content DIV.
    $row->body = '';
    $containers = $row->xpath->query('//div[@id="some-id"]');
    if ($containers->length > 0) {
      $row->body = $this->outerHtml($containers->item(0));
    }
    else {
      // Only when the DIV is missing, fall back to the inner HTML of BODY.
      // Running this unconditionally would always discard the DIV result.
      $bodies = $row->xpath->query('//body');
      if ($bodies->length > 0) {
        $row->body = $this->innerHtml($bodies->item(0));
      }
    }

    return TRUE;
  }

  /**
   * Serializes a node including its own tag (outer HTML).
   *
   * @param DOMNode $node
   *   The node to serialize.
   *
   * @return string
   *   The node's markup.
   */
  private function outerHtml(DOMNode $node) {
    return $node->ownerDocument->saveHTML($node);
  }

  /**
   * Serializes only the children of a node (inner HTML).
   *
   * @param DOMNode $node
   *   The node whose children are serialized.
   *
   * @return string
   *   The concatenated markup of all child nodes; '' if there are none.
   */
  private function innerHtml(DOMNode $node) {
    $html = '';
    foreach ($node->childNodes as $child) {
      $html .= $child->ownerDocument->saveHTML($child);
    }
    return $html;
  }

}

That’s it! You may need to use cURL instead if file_get_contents() doesn’t give you enough flexibility (for example, if your URLs require authentication).