From 18ac1a8e5aa83b276ea5b9702e5f38ecae4e57e7 Mon Sep 17 00:00:00 2001 From: Haridas Narayanaswamy Date: Tue, 24 Mar 2015 08:28:56 +0530 Subject: [PATCH 1/4] Added New Client with Ketama based Consistent Hashing support. TODO: - Add more documentation - Way to test the client with multiple memcache servers. --- memcache.py | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) diff --git a/memcache.py b/memcache.py index 80770ed..f9f2175 100644 --- a/memcache.py +++ b/memcache.py @@ -1412,6 +1412,200 @@ def __str__(self): return "unix:%s%s" % (self.address, d) +class KetamaClient(Client): + """ Memcach client with Consistent hashing support. + + We are using the ketama algorithm to implement the consistent hashing. + + The ideal use case for this client is when your caching servers are + going to gets added or removed on time and you don't want to hot load + most of the hash keys at each time. If you are using consistent hashing + based client when ever there is change in the number of caching servers, + only very few percentage of cache miss happens across all servers. + By adjusting the SERVER_WEIGHT for your environment you can get least miss + rate. + + How Ketama Works: + Ketama algorithm uses very simple algorithm to achieve the consistent + hashing. What it does is that, + + 1. Preset the total number of Keys that we are going to save on the + hash server. eg; Total of 2 ** 16 or so. + 2. Logically we put all this hash keys on a ring in ascending order. + 3. For each server we give ring slots ( Hash key, or a position on the + ring.) Also we place same server in multiple places of the ring + to get better key distribution on a server. + 4. When we want to place a value on the hash server, we give a slot + for that vale on the ring, and then we find a next closest server + on the ring by searching clock-wise. And then we pick that server + to actually save key:value pair. + 5. Reading time, the same process happens. We find the ring slot for + the given key, and find the next server on the ring by searching + clock-wise. We know that we placed the value for that key on that + server. + 6. So when we add or remove one server, some server slots getting + removed from the ring or some new one gets added. After this update + there is chances that some keys will miss, since we stop searching + for another server on the ring once we get the first server by doing + the clock-wise lookup. But the miss rate will be #Keys / RING_SIZE. + In case of non-consistent hashing method, since the hash function + depends on the number of servers, the majority of the keys misses if + we add or remove server. + + TODO: Improve the documentation, add test cases. + """ + # For this Consistent hashing client, the weight of the server means number + # of times the same server is placed on the different slots of the ketama + # hash key ring. This will make sure the each server have well normalized + # key distribution. + DEFAULT_SERVER_WEIGHT = 200 + + # Total number of slots on the ring. + # If addition or deletion of a new server only causes 1 to 5 percentage + # cache miss on the current configuration. ie; K / RING_SIZE + # where K means total keys stored on the ring. + RING_SIZE = 2 ** 16 + + def __init__(self, *args, **kwargs): + # Mapping between ring slot -> server. + self._ketama_server_ring = {} + + # Sorted server slots on top of the virtual ring. + self._ketama_server_slots = [] + + super(KetamaClient, self).__init__(*args, **kwargs) + + def add_server(self, server): + """ + Add new server to the client. + + @param servers: server host in : format. + or in tuple of (:, weight) + """ + server_obj = memcache._Host( + server if isinstance(server, tuple) else ( + server, self.DEFAULT_SERVER_WEIGHT), + self.debug, dead_retry=self.dead_retry, + socket_timeout=self.socket_timeout, + flush_on_reconnect=self.flush_on_reconnect) + + self._place_server_on_ring(server_obj) + + def _get_server(self, key): + """ + Get the memcache server corresponding to the given key. + + Here we find the first server on the ring by searching clock-wise + from the given ring slot corresponding to the key. + + @param key: The input query. + + @return A tuple with (server_obj, key). + """ + # map the key on to the ring slot space. + h_key = self._generate_ring_slot(key) + + for slot in self._ketama_server_slots: + if h_key <= slot: + server = self._ketama_server_ring[slot] + if server.connect(): + return (server, key) + + # Even after allocating the server, if the h_key won't fit + # on any server, then pick the first server on the ring. + server = (self._ketama_server_ring[self._ketama_server_slots[0]] + if self._ketama_server_slots else None) + + server and server.connect() + return server, key + + def set_servers(self, servers): + """ + Add a pool of servers into the client. + + @param servers: List of server hosts in : format. + or + List of tuples with each tuple of the format + (:, weight) + """ + # Set the default weight if weight isn't passed. + self.servers = [memcache._Host( + s if isinstance(s, tuple) else (s, self.DEFAULT_SERVER_WEIGHT), + self.debug, dead_retry=self.dead_retry, + socket_timeout=self.socket_timeout, + flush_on_reconnect=self.flush_on_reconnect) for s in servers] + + # Place all the servers on rings based on the slot allocation + # specifications. + map(self._place_server_on_ring, self.servers) + + def _place_server_on_ring(self, server): + """ + Place given server on the ring. + + Based on the weight of the server, we generate multiple slots for + one key. This will give better key distribution. + + @param server: An instance of :class:~`memcache._Host`. + """ + server_slots = self._get_server_slots_on_ring(server) + for slot in server_slots: + if slot not in self._ketama_server_ring: + self._ketama_server_ring[slot] = server + self._ketama_server_slots.append(slot) + else: + # There is a key collection(<<<1% chance). + # Discarding this scenario now. + # TODO: Handle it. + pass + + # Sort the server slot keys to make it a ring. + self._ketama_server_slots.sort() + + def _get_server_slots_on_ring(self, server): + """ + Returns list of slot on the ring for given server. + + This make sure that the slots won't collide with others server. + + @param: server An object of :class:~`memcache._Host`. + @return: list of slots on the ring. + """ + server_slots = [] + + for i in range(0, server.weight): + # TODO: Keep a UUID id for each servers to avoid key collision. + server_key = "{}_{}".format("{}:{}".format(server.ip, + server.port), i) + + server_slots.append(self._generate_ring_slot(server_key)) + + return server_slots + + def _generate_ring_slot(self, key): + """ + Hash function which give random slots on the ring. Hash functon make + sure that the key distribution is even as much as possible. + + @param key: Key which need to be mapped to the hash space. + @type key: str + + @return: hash key corresponding to the `key` + """ + #TODO: Make it more general. + + # Simple hash method using python's internal hash algorithm. + #h_key = hash(key) & 0xffff + + # crc32 based hashing + #h_key = ((crc32(key) & 0xffffffff) >> 16) & 0xffff + + # For better randomness + h_key = ((crc32(key) & 0xffffffff)) & 0xffff + + return h_key + + def _doctest(): import doctest import memcache From ed5a2000f5c6cf0cd35f6cefee48653942f35b86 Mon Sep 17 00:00:00 2001 From: Sergio Martins Date: Thu, 26 Mar 2015 12:43:23 -0400 Subject: [PATCH 2/4] Fixed bugs and improved comments. - Used description from Richard's blog (with his permission) where appropriate - Fixed usage of crc32 function - Fixed issue with crc32 in python3 - Fixed _get_server() not handling (serverhash, key) tuple - Fixed usage of _Host class - Removed unused function add_server() - Ran tests using KetamaClass (not included in this commit) - Tested against multiple memcached servers --- memcache.py | 137 +++++++++++++++++----------------------------------- 1 file changed, 43 insertions(+), 94 deletions(-) diff --git a/memcache.py b/memcache.py index f9f2175..014d681 100644 --- a/memcache.py +++ b/memcache.py @@ -1415,56 +1415,31 @@ def __str__(self): class KetamaClient(Client): """ Memcach client with Consistent hashing support. - We are using the ketama algorithm to implement the consistent hashing. - - The ideal use case for this client is when your caching servers are - going to gets added or removed on time and you don't want to hot load - most of the hash keys at each time. If you are using consistent hashing - based client when ever there is change in the number of caching servers, - only very few percentage of cache miss happens across all servers. - By adjusting the SERVER_WEIGHT for your environment you can get least miss - rate. + Ketama is an implementation of a consistent hashing algorithm, meaning you + can add or remove servers from the memcached pool without causing a + complete remap of all keys. It was designed by Richard Jones. How Ketama Works: - Ketama algorithm uses very simple algorithm to achieve the consistent - hashing. What it does is that, - - 1. Preset the total number of Keys that we are going to save on the - hash server. eg; Total of 2 ** 16 or so. - 2. Logically we put all this hash keys on a ring in ascending order. - 3. For each server we give ring slots ( Hash key, or a position on the - ring.) Also we place same server in multiple places of the ring - to get better key distribution on a server. - 4. When we want to place a value on the hash server, we give a slot - for that vale on the ring, and then we find a next closest server - on the ring by searching clock-wise. And then we pick that server - to actually save key:value pair. - 5. Reading time, the same process happens. We find the ring slot for - the given key, and find the next server on the ring by searching - clock-wise. We know that we placed the value for that key on that - server. - 6. So when we add or remove one server, some server slots getting - removed from the ring or some new one gets added. After this update - there is chances that some keys will miss, since we stop searching - for another server on the ring once we get the first server by doing - the clock-wise lookup. But the miss rate will be #Keys / RING_SIZE. - In case of non-consistent hashing method, since the hash function - depends on the number of servers, the majority of the keys misses if - we add or remove server. + 1. Hash each server to several unsigned integer values. + 2. Conceptually, these numbers are placed on a ring. + 3. Each number links to the server it was hashed from, so servers + appear at several points on the ring. + 4. To map a key->server, hash the key to an unsigned integer and find + the next biggest number on the ring. That's your server. If + the number is too big, roll over to the first server in the ring + When a server is added or removed, only some keys will be remapped to + different servers. With the original modula algorithm, all keys + would have been remapped. TODO: Improve the documentation, add test cases. """ - # For this Consistent hashing client, the weight of the server means number - # of times the same server is placed on the different slots of the ketama - # hash key ring. This will make sure the each server have well normalized - # key distribution. + # For this Consistent hashing client, the weight of the server is the + # number of entries it will have in the ring. This will make sure + # each server has well normalized key distribution. DEFAULT_SERVER_WEIGHT = 200 # Total number of slots on the ring. - # If addition or deletion of a new server only causes 1 to 5 percentage - # cache miss on the current configuration. ie; K / RING_SIZE - # where K means total keys stored on the ring. - RING_SIZE = 2 ** 16 + RING_SIZE = 2**16 def __init__(self, *args, **kwargs): # Mapping between ring slot -> server. @@ -1475,53 +1450,38 @@ def __init__(self, *args, **kwargs): super(KetamaClient, self).__init__(*args, **kwargs) - def add_server(self, server): - """ - Add new server to the client. - - @param servers: server host in : format. - or in tuple of (:, weight) - """ - server_obj = memcache._Host( - server if isinstance(server, tuple) else ( - server, self.DEFAULT_SERVER_WEIGHT), - self.debug, dead_retry=self.dead_retry, - socket_timeout=self.socket_timeout, - flush_on_reconnect=self.flush_on_reconnect) - - self._place_server_on_ring(server_obj) - def _get_server(self, key): """ Get the memcache server corresponding to the given key. - Here we find the first server on the ring by searching clock-wise - from the given ring slot corresponding to the key. + @param key: key, or (server_hash, key) tuple if you want to specify + a hash to determine which server is selected - @param key: The input query. - - @return A tuple with (server_obj, key). + @return A tuple with (server_obj, key), or (None, None) if no servers + were available. """ # map the key on to the ring slot space. h_key = self._generate_ring_slot(key) + if isinstance(key, tuple): + serverhash, key = key + for slot in self._ketama_server_slots: if h_key <= slot: server = self._ketama_server_ring[slot] if server.connect(): return (server, key) - # Even after allocating the server, if the h_key won't fit - # on any server, then pick the first server on the ring. - server = (self._ketama_server_ring[self._ketama_server_slots[0]] - if self._ketama_server_slots else None) + # Roll over to the first available server + for server in self._ketama_server_ring.values(): + if server and server.connect(): + return (server, key) - server and server.connect() - return server, key + return (None, None) def set_servers(self, servers): """ - Add a pool of servers into the client. + Set servers for this client. @param servers: List of server hosts in : format. or @@ -1529,7 +1489,7 @@ def set_servers(self, servers): (:, weight) """ # Set the default weight if weight isn't passed. - self.servers = [memcache._Host( + self.servers = [_Host( s if isinstance(s, tuple) else (s, self.DEFAULT_SERVER_WEIGHT), self.debug, dead_retry=self.dead_retry, socket_timeout=self.socket_timeout, @@ -1541,10 +1501,9 @@ def set_servers(self, servers): def _place_server_on_ring(self, server): """ - Place given server on the ring. - - Based on the weight of the server, we generate multiple slots for - one key. This will give better key distribution. + Based on the weight of the server, generate multiple slots for + each server. This ensures when a server is added/remove keys won't all + remap to the same new server @param server: An instance of :class:~`memcache._Host`. """ @@ -1554,9 +1513,7 @@ def _place_server_on_ring(self, server): self._ketama_server_ring[slot] = server self._ketama_server_slots.append(slot) else: - # There is a key collection(<<<1% chance). - # Discarding this scenario now. - # TODO: Handle it. + # TODO: Handle collisions pass # Sort the server slot keys to make it a ring. @@ -1584,27 +1541,19 @@ def _get_server_slots_on_ring(self, server): def _generate_ring_slot(self, key): """ - Hash function which give random slots on the ring. Hash functon make - sure that the key distribution is even as much as possible. + Returns a slot in the ring for the given key. - @param key: Key which need to be mapped to the hash space. + @param key: Key which needs to be mapped to the ring. @type key: str - @return: hash key corresponding to the `key` + @return: hash value corresponding to the `key` """ - #TODO: Make it more general. - - # Simple hash method using python's internal hash algorithm. - #h_key = hash(key) & 0xffff - - # crc32 based hashing - #h_key = ((crc32(key) & 0xffffffff) >> 16) & 0xffff - - # For better randomness - h_key = ((crc32(key) & 0xffffffff)) & 0xffff - - return h_key + if isinstance(key, tuple): + serverhash, key = key + else: + serverhash = binascii.crc32(key.encode('ascii')) & 0xffffffff + return serverhash % self.RING_SIZE def _doctest(): import doctest From 51fdfc8a10361fed155af30692eac64fa3b6369d Mon Sep 17 00:00:00 2001 From: Sergio Martins Date: Thu, 26 Mar 2015 15:16:21 -0400 Subject: [PATCH 3/4] Added testing for KetamaClient --- tests/test_memcache.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test_memcache.py b/tests/test_memcache.py index c12c528..9e233a2 100644 --- a/tests/test_memcache.py +++ b/tests/test_memcache.py @@ -2,7 +2,7 @@ from unittest import TestCase -from memcache import Client, SERVER_MAX_KEY_LENGTH +from memcache import Client, KetamaClient, SERVER_MAX_KEY_LENGTH try: _str_cls = basestring @@ -31,10 +31,10 @@ def __eq__(self, other): class TestMemcache(TestCase): - def setUp(self): + def setUp(self, client_class=Client): # TODO: unix socket server stuff servers = ["127.0.0.1:11211"] - self.mc = Client(servers, debug=1) + self.mc = client_class(servers, debug=1) pass def check_setget(self, key, val, noreply=False): @@ -119,6 +119,12 @@ def test_sending_key_too_long(self): self.mc.set('a' * SERVER_MAX_KEY_LENGTH, 1, noreply=True) +class TestMemcacheKetama(TestMemcache): + def setUp(self): + # Run all the tests again using the KetamaClient + super(TestMemcacheKetama, self).setUp(KetamaClient) + + if __name__ == "__main__": # failures = 0 # print("Testing docstrings...") From 3465e73632b702566a80e903dc3b88f5adb3e80a Mon Sep 17 00:00:00 2001 From: Sergio Martins Date: Thu, 26 Mar 2015 16:12:20 -0400 Subject: [PATCH 4/4] Fix format string issue in python2.6 --- memcache.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/memcache.py b/memcache.py index 014d681..068e4ab 100644 --- a/memcache.py +++ b/memcache.py @@ -1531,10 +1531,7 @@ def _get_server_slots_on_ring(self, server): server_slots = [] for i in range(0, server.weight): - # TODO: Keep a UUID id for each servers to avoid key collision. - server_key = "{}_{}".format("{}:{}".format(server.ip, - server.port), i) - + server_key = "%s:%d_%d" % (server.ip, server.port, i) server_slots.append(self._generate_ring_slot(server_key)) return server_slots