The Node.js cluster module is a common method of scaling servers, allowing for the use of all available CPU cores. However, what happens when you must scale to multiple servers or virtual machines?
That is the problem we faced when scaling our newest HTML5 MMORPG. Rather than trying to cluster on a single machine, we wanted to get the benefit of a truly distributed system that can automatically fail over and spread the load across multiple servers and even data centers.
We went through several iterations before deciding on the setup outlined below. We'll be running through a basic example app, which is more general-purpose than some of the custom systems we built for massively multiplayer gaming, but the same concepts apply.
We began by defining the structure of our new stack before selecting the specific pieces; the individual technologies are secondary. As you can see in the diagram, we've broken our app into three layers: a load balancer in front, the Node app in the middle, and a messaging system to tie it all together.
If you've had any experience setting up distributed systems, then you are probably used to a similar pattern. Things get interesting when you start to look at the different pieces involved with this type of stack.
With our structure in place, we now need to fill in those pieces, starting with the load balancer. There are a lot of great load balancers out there that we considered: Nginx, HAProxy, node-http-proxy, etc. These are all very capable and are all in wide use today.
Ultimately, we selected node-http-proxy because we felt most comfortable customizing it for our needs (our stack is essentially 100% JavaScript) while maintaining a sufficient level of performance. It should be noted that at the time, many of the necessary features in HAProxy were still in beta, but they have recently gone stable in v1.5. We also felt that Nginx was a bit of overkill since we only needed to proxy WebSockets (everything else is served through a CDN; we previously benchmarked CloudFlare vs. CloudFront).
What's cool about node-http-proxy is that instead of writing a configuration file, you write just another Node app, so you'll feel right at home. Here's a stripped-down version of our proxy server:
var https = require('https');
var fs = require('fs');
var proxy = require('http-proxy');

https.globalAgent.maxSockets = 10240;

// Define the servers to load balance.
var servers = [
  {host: 'SERVER1-IP', port: 80},
  {host: 'SERVER2-IP', port: 80},
  {host: 'SERVER3-IP', port: 80}
];

// Load the SSL cert.
var ca = [
  fs.readFileSync('./certs/PositiveSSLCA2.crt'),
  fs.readFileSync('./certs/AddTrustExternalCARoot.crt')
];
var opts = {
  ca: ca,
  key: fs.readFileSync('./certs/example_wild.key'),
  cert: fs.readFileSync('./certs/STAR_example_com.crt')
};

// Create a proxy object for each target.
var proxies = servers.map(function(target) {
  return proxy.createProxyServer({
    target: target,
    ws: true,
    xfwd: true,
    ssl: opts
  });
});

/**
 * Select a random server to proxy to.
 * @return {Number} Index of the proxy to use.
 */
var selectServer = function() {
  return Math.floor(Math.random() * proxies.length);
};

// Select the next server and send the http request.
var serverCallback = function(req, res) {
  var proxyIndex = selectServer();
  var proxy = proxies[proxyIndex];
  proxy.web(req, res);
  proxy.on('error', function(err) {
    console.error('Proxy error:', err);
  });
};
var server = https.createServer(opts, serverCallback);

// Get the next server and send the upgrade request.
server.on('upgrade', function(req, socket, head) {
  var proxyIndex = selectServer();
  var proxy = proxies[proxyIndex];
  proxy.ws(req, socket, head);
  proxy.on('error', function(err, req, socket) {
    socket.end();
    console.error('Proxy error:', err);
  });
});

server.listen(443);
All this does is accept 'upgrade' and plain HTTP requests (the HTTP requests are needed for SockJS fallbacks), then proxy each one to a Node server chosen at random by selectServer, using proxy.web() and proxy.ws().
I also included the code to get up and running with SSL, since this is a common pain point. I strongly recommend terminating SSL at the load balancer, as encrypted WebSocket connections are generally more reliable (intermediate proxies are less likely to interfere with them), but if you don't need it you can slim the code down even further.
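For comparison, here is what a slimmed-down, non-SSL version of the balancer might look like (just a sketch that reuses the servers array and the selectServer function from above):

var http = require('http');
var proxy = require('http-proxy');

// One proxy per backend; no TLS termination at the balancer.
var proxies = servers.map(function(target) {
  return proxy.createProxyServer({target: target, ws: true, xfwd: true});
});

// Proxy plain HTTP requests (needed for SockJS fallbacks).
var server = http.createServer(function(req, res) {
  proxies[selectServer()].web(req, res);
});

// Proxy WebSocket upgrade requests.
server.on('upgrade', function(req, socket, head) {
  proxies[selectServer()].ws(req, socket, head);
});

server.listen(80);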
We now have a great little load balancer, but we wanted to take it further and add a little more customization with sticky sessions and automatic failover. If one of the servers goes down for an extended period of time, we need to be able to migrate current and new requests to the other servers. So, we just made a few simple updates:
var https = require('https');
var fs = require('fs');
var proxy = require('http-proxy');
var request = require('request');

https.globalAgent.maxSockets = 10240;

// Define the servers to load balance.
var servers = [
  {host: 'SERVER1-IP', port: 80},
  {host: 'SERVER2-IP', port: 80},
  {host: 'SERVER3-IP', port: 80}
];
var failoverTimer = [];

// Load the SSL cert.
var ca = [
  fs.readFileSync('./certs/PositiveSSLCA2.crt'),
  fs.readFileSync('./certs/AddTrustExternalCARoot.crt')
];
var opts = {
  ca: ca,
  key: fs.readFileSync('./certs/example_wild.key'),
  cert: fs.readFileSync('./certs/STAR_example_com.crt')
};

// Create a proxy object for each target.
var proxies = servers.map(function(target) {
  return proxy.createProxyServer({
    target: target,
    ws: true,
    xfwd: true,
    ssl: opts,
    down: false
  });
});

/**
 * Select a random server to proxy to. If a 'server' cookie is set, use that
 * as the sticky session so the user stays on the same server (good for ws fallbacks).
 * @param  {Object} req HTTP request data
 * @param  {Object} res HTTP response
 * @return {Number} Index of the proxy to use.
 */
var selectServer = function(req, res) {
  var index = -1;
  var i = 0;

  // Check if there are any cookies.
  if (req.headers && req.headers.cookie && req.headers.cookie.length > 1) {
    var cookies = req.headers.cookie.split('; ');
    for (i = 0; i < cookies.length; i++) {
      if (cookies[i].indexOf('server=') === 0) {
        var value = cookies[i].substring(7);
        if (value && value !== '') {
          index = parseInt(value, 10);
          break;
        }
      }
    }
  }

  // Select a random server if they don't have a sticky session.
  if (index < 0 || !proxies[index]) {
    index = Math.floor(Math.random() * proxies.length);
  }

  // If the selected server is down, select one that isn't down.
  if (proxies[index].options.down) {
    index = -1;
    var tries = 0;
    while (tries < 5 && index < 0) {
      var randIndex = Math.floor(Math.random() * proxies.length);
      if (!proxies[randIndex].options.down) {
        index = randIndex;
      }
      tries++;
    }
  }
  index = index >= 0 ? index : 0;

  // Store the server index as a sticky session.
  if (res) {
    res.setHeader('Set-Cookie', 'server=' + index + '; path=/');
  }

  return index;
};

/**
 * Fired when there is an error with a request.
 * Sets up a 10-second interval to ping the host until it is back online.
 * There is a 10-second buffer before requests start getting blocked to this host.
 * @param {Number} index Index in the proxies array.
 */
var startFailoverTimer = function(index) {
  if (failoverTimer[index]) {
    return;
  }

  failoverTimer[index] = setTimeout(function() {
    // Check if the server is up or not.
    request({
      url: 'http://' + proxies[index].options.target.host + ':' + proxies[index].options.target.port,
      method: 'HEAD',
      timeout: 10000
    }, function(err, res, body) {
      failoverTimer[index] = null;
      if (res && res.statusCode === 200) {
        proxies[index].options.down = false;
        console.log('Server #' + index + ' is back up.');
      } else {
        proxies[index].options.down = true;
        startFailoverTimer(index);
        console.log('Server #' + index + ' is still down.');
      }
    });
  }, 10000);
};

// Select the next server and send the http request.
var serverCallback = function(req, res) {
  var proxyIndex = selectServer(req, res);
  var proxy = proxies[proxyIndex];
  proxy.web(req, res);
  proxy.on('error', function(err) {
    startFailoverTimer(proxyIndex);
  });
};
var server = https.createServer(opts, serverCallback);

// Get the next server and send the upgrade request.
server.on('upgrade', function(req, socket, head) {
  var proxyIndex = selectServer(req);
  var proxy = proxies[proxyIndex];
  proxy.ws(req, socket, head);
  proxy.on('error', function(err, req, socket) {
    socket.end();
    startFailoverTimer(proxyIndex);
  });
});

server.listen(443);
In order to implement sticky sessions, we simply updated our selectServer method to read the user's cookies and route to the specified server if one is set. Otherwise, we continue as normal and set the 'server' cookie to the index of the server we route that user to. This isn't always necessary, but when using something like SockJS, it prevents your users from bouncing between servers if they fall back to polling.
In addition to sticky sessions, we created a very basic automatic failover system by keeping track of which servers are down. When a server errors, we start a timer that checks the server with the request module. As long as it returns anything but a 200 status code, we assume the server is still down. Once it comes back online, we mark it as up and begin routing connections to it again. However, to avoid false alarms, it is a good idea to give a buffer of 10 seconds or so before marking a server as down.
We can now route requests evenly across multiple instances of our Node app, so next we need something to actually balance the load to. The focus of this post isn't on the app layer, as everyone's will be drastically different, so we will just set up a simple relay server that passes WebSocket messages between clients. We like to use SockJS, but this setup will work just as well with socket.io, ws or any other implementation.
First, we'll create our client to connect and receive messages:
// Create our connection to the server.
var sock = new SockJS('https://LOAD-BALANCER-IP/sockjs');

// Listen for incoming messages and log them.
sock.onmessage = function(e) {
  console.log('Message:', e.data);
};
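To push something through from the same page, you can send a message once the socket opens (a quick sketch; the message text is arbitrary):

// Send a test message once the connection is established.
sock.onopen = function() {
  sock.send('Hello from the browser!');
};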
Next, we'll set up our simple Node app to relay those messages to other clients:
var http = require('http');
var sockjs = require('sockjs');

// Setup our SockJS server.
var clients = [];
var echo = sockjs.createServer();
echo.on('connection', function(conn) {
  // Add this client to the client list.
  clients.push(conn);

  // Listen for data coming from clients.
  conn.on('data', function(message) {
    // Broadcast the message to all connected clients.
    for (var i = 0; i < clients.length; i++) {
      clients[i].write(message);
    }
  });

  // Remove the client from the list.
  conn.on('close', function() {
    clients.splice(clients.indexOf(conn), 1);
  });
});

// Begin listening.
var server = http.createServer();
echo.installHandlers(server, {prefix: '/sockjs'});
server.listen(80);
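One small note that ties back to the load balancer: the failover check from earlier sends a HEAD request to '/' on each of these servers and expects a 200, while the bare server above only handles SockJS traffic under '/sockjs'. Since SockJS-node passes requests outside its prefix on to handlers you've already registered, a minimal way to cover this (a sketch; the response body is arbitrary) is to give createServer a handler:

// Answer the load balancer's health checks at '/' with a 200,
// while SockJS keeps handling everything under '/sockjs'.
var server = http.createServer(function(req, res) {
  res.writeHead(200, {'Content-Type': 'text/plain'});
  res.end('OK');
});
echo.installHandlers(server, {prefix: '/sockjs'});
server.listen(80);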
While extremely simplistic, this works well for a single-server setup. The problem, however, is that our list of clients is sitting in memory in this one Node process. This is where the messaging layer steps in to tie it all together.
Just as with load balancers, there are several great messaging solutions. We tested a few, such as RabbitMQ and ZeroMQ, but nothing beat the simplicity and raw speed of Redis (not to mention that it can double as a session store, among other things). Depending on your use case, any of these could be a great option.
Redis is so easy to get up and running that I'm not going to waste your time rehashing it. Simply head over to redis.io and check out the documentation. In our simple Node app we are going to use Redis Pub/Sub to broadcast our WebSocket messages to all of our load-balanced servers. This will allow all of our clients to get all messages, even if the original message came from a different server.
var http = require('http');
var sockjs = require('sockjs');
var redis = require('redis');

// Setup Redis pub/sub.
// NOTE: You must create two Redis clients, as
// the one that subscribes can't also publish.
var pub = redis.createClient();
var sub = redis.createClient();
sub.subscribe('global');

// Listen for messages being published to this server.
sub.on('message', function(channel, msg) {
  // Broadcast the message to all connected clients on this server.
  for (var i = 0; i < clients.length; i++) {
    clients[i].write(msg);
  }
});

// Setup our SockJS server.
var clients = [];
var echo = sockjs.createServer();
echo.on('connection', function(conn) {
  // Add this client to the client list.
  clients.push(conn);

  // Listen for data coming from clients.
  conn.on('data', function(message) {
    // Publish this message to the Redis pub/sub.
    pub.publish('global', message);
  });

  // Remove the client from the list.
  conn.on('close', function() {
    clients.splice(clients.indexOf(conn), 1);
  });
});

// Begin listening.
var server = http.createServer();
echo.installHandlers(server, {prefix: '/sockjs'});
server.listen(80);
The joy of Redis is just how simple it is to do powerful things like Pub/Sub (and virtually anything else, for that matter). In just a few lines of code, we've subscribed to the 'global' channel and are publishing every message we receive to it. Redis handles the rest, and our (admittedly useless) app can now scale across multiple servers! Each server subscribed to the `global` channel receives the data and broadcasts it to its own list of connected clients, so to the user it looks like a single server.
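If you want to sanity-check the whole path without opening a second browser tab, a throwaway publisher script works just as well (a sketch, assuming Redis is reachable with the client's default settings):

// publish-test.js -- hypothetical one-off script that pushes a message
// through the 'global' channel and out to every connected client.
var redis = require('redis');
var pub = redis.createClient();

pub.publish('global', 'hello from a test script', function() {
  pub.quit();
});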
This is clearly a naive solution that makes many generalizations, but it was essentially our starting point before we evolved it into a fully scalable MMO backend. For example, in a real-world situation you probably wouldn't want to broadcast every message to every client. In our case, we keep track in Redis of each client and the server it is connected to. We can then publish messages only to the servers that need them and dramatically cut down on bandwidth and latency.
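As a rough illustration of that idea (the channel naming scheme, the clientServers hash and the SERVER_ID identifier here are all made up for the example, reusing the pub and sub clients from the code above), each server can subscribe to its own channel and a small lookup decides where a message needs to go:

// Hypothetical targeted publishing: each app server subscribes to its own
// channel, and a Redis hash maps client IDs to the server they live on.
var SERVER_ID = process.env.SERVER_ID || 'server-1';
sub.subscribe('server:' + SERVER_ID);

// When a client connects, remember which server it landed on.
var registerClient = function(clientId) {
  pub.hset('clientServers', clientId, SERVER_ID);
};

// Send a message to a single client, wherever it is connected.
var sendToClient = function(clientId, message) {
  pub.hget('clientServers', clientId, function(err, serverId) {
    if (err || !serverId) { return; }
    pub.publish('server:' + serverId, JSON.stringify({to: clientId, data: message}));
  });
};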
The great thing is that Node and Redis make it very easy to get something simple up and running, but still provide the power to dive deeper and build more complex systems. Happy scaling, and we'd love to hear how others have tackled these problems in the comments! And, if you'd like to see this running live, be sure to check out CasinoRPG and let us know what you think.