Copying Databases in Node

Yes, there are tons of great, StackOverflow-accepted ways to copy giant databases between servers. But sometimes you have a special need for a project that requires selectively copying subsets of data instead of an entire 15GB database.

So, when you have to programmatically copy rows between databases in Node…


Problems with LIMIT

When I first attempted this, I ran into several problems:

  • I didn’t use a LIMIT, and ran out of memory.
  • I specified a LIMIT, but the bulk INSERT failed for some reason.
  • I specified a lower LIMIT, but the INSERT performed differently on each table.
  • I specified an arbitrarily low LIMIT, but on simple tables, it took longer.

I realized there’s no perfect way of doing this, and I surely didn’t want to insert a single row at a time.

Group INSERTs by Size, not Rows

First, let’s use the following tools:

  • knex – a SQL query builder for Node
  • bluebird – a full-featured Promise library
  • progress – an ASCII progress bar for the CLI
  • chalk – terminal string styling

The premise is to copy N rows that roughly equate to M amount of memory.
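For example (numbers invented for illustration): if MySQL estimates a table at 500MB of data plus indexes and an exact count finds 1,000,000 rows, a 25MB memory budget works out to roughly 1,000,000 × 25 / 500 = 50,000 rows per INSERT group, i.e. 20 sequential groups.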

1. Set up dependencies

var knex = require('knex');
var Promise = require('bluebird');
var ProgressBar = require('progress');
var chalk = require('chalk');

var local = knex.initialize(...);
var remote = knex.initialize(...);
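The initialize calls take your connection settings, which are elided above. A minimal sketch, assuming MySQL on both ends (host and credentials are placeholders):

var remote = knex.initialize({
  client: 'mysql',
  connection: {
    host: 'db.example.com', // placeholder host
    user: 'copier',         // placeholder credentials
    password: 'secret',
    database: 'production', // the source database
  },
});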

2. Get table stats

var getTableStats = function (table) {
  var stats = {};

  // Query the exact row count (`information_schema`'s row count is just an estimate)
  return remote(table).count('*').then(function (response) {
    stats.totalRows = response.shift()['count(*)'];
  })
  // Request the DB's size estimates for the table (approximate)
  .then(function () {
    return remote('information_schema.TABLES').select('data_length', 'index_length').where({
      table_name: table,
      table_schema: remote.client.connectionSettings.database,
    });
  })
  // Calculate how many rows per insertion group stay within memory limits
  .then(function (result) {
    // Base variables for insertion chunk calculations
    stats.memoryLimit = 25 * 1024 * 1024; // 25MB
    stats.dataLength = result[0]['data_length'];
    stats.indexLength = result[0]['index_length'];
    stats.totalLength = stats.dataLength + stats.indexLength;
    stats.rowLimit = Math.min(stats.totalRows, Math.ceil(stats.totalRows * stats.memoryLimit / stats.totalLength));

    return stats;
  });
};
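To make the math concrete, here's what the resolved stats might look like for a hypothetical 1,000,000-row table weighing in around 500MB (all values invented for illustration):

getTableStats('users').then(function (stats) {
  // {
  //   totalRows: 1000000,
  //   memoryLimit: 26214400,  // 25MB
  //   dataLength: 471859200,  // ~450MB of data
  //   indexLength: 52428800,  // ~50MB of indexes
  //   totalLength: 524288000, // ~500MB combined
  //   rowLimit: 50000         // ceil(1000000 * 26214400 / 524288000)
  // }
  console.log(stats);
});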

3. Copy table in groups

Now that we have stats built around a 25MB memory limit, we can copy the table:

var copyTable = function (table) {
  return getTableStats(table).then(function (stats) {
    // Empty array sized to the number of insertion groups, purely for iteration
    var iterations = new Array(Math.ceil(stats.totalRows / stats.rowLimit));

    // Promise.reduce ensures iterations are done sequentially, whereas
    // Promise.all/map runs in parallel
    return Promise.reduce(
      iterations,
      function (previous, iteration, i) {
        var rowOffset = stats.rowLimit * i;

        return copyRows(table, rowOffset, stats.rowLimit, stats.totalRows);
      },
      []
    );
  });
};
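Since Promise.reduce already handles the sequencing, the same pattern extends to copying several tables back-to-back. A minimal sketch (the table names here are hypothetical):

Promise.reduce(['users', 'orders', 'events'], function (previous, table) {
  return copyTable(table);
}, null);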

4. Copy individual rows

All that our copyTable function is missing is copyRows:

var copyRows = function (table, offset, limit, total) {
  return remote(table)
    .select('*')
    .offset(offset)
    .limit(limit)
    .then(function (rows) {
      // "[DATABASE] Inserting records N-M of O into TABLE"
      var msg = [
        '[' + chalk.yellow(local.client.connectionSettings.database) + ']',
        'Inserting records',
        chalk.green(offset + 1) + '-' + chalk.green(offset + rows.length),
        'of',
        chalk.green(total),
        'into',
        chalk.yellow(table),
        '[:bar] :percent :etas',
      ].join(' ');

      var bar = new ProgressBar(msg, {
        total: rows.length,
        width: 20,
      });

      // Insert each record individually because it's actually faster (!?)
      // and gives us a pretty progress bar.
      return Promise.map(rows, function (row) {
        return local(table)
          .insert(row)
          .then(function () {
            bar.tick();
          });
      });
    });
};
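One caveat: Promise.map runs the single-row INSERTs with unlimited concurrency by default. If that saturates the local connection pool, Bluebird's map accepts a concurrency option; this is a drop-in for the Promise.map call above (10 is an assumed value to tune, not a recommendation):

// Cap the number of in-flight INSERTs
return Promise.map(rows, function (row) {
  return local(table)
    .insert(row)
    .then(function () {
      bar.tick();
    });
}, { concurrency: 10 });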

5. Finish

Finally, kick off the whole promise chain:

copyTable('something_huge').then(
  function () {
    console.log('Success!');
    process.exit(0);
  },
  function (err) {
    console.error(err);
    process.exit(1);
  }
);
