Package vinzclortho :: Module core

Source Code for Module vinzclortho.core

# -*- coding: utf-8 -*-
#
# Copyright (c) 2001-2010 Pär Bohrarper.
# See LICENSE for details.

import functools
import cPickle as pickle
import base64
import bz2
import optparse
import random
import platform
import collections
import sys
import store
import tangled.core as tc
import tangled.client
import tangled.server as ts
import vectorclock
import consistenthashing as chash

import logging
log = logging.getLogger("vinzclortho.core")


class InvalidContext(Exception):
    pass


class LocalStorage(object):
    """
    A wrapper that makes calls to a L{store.Store} execute on a worker
    and return L{tangled.core.Deferred}s.
    """
    def __init__(self, worker, name, partition, persistent):
        self.worker = worker
        self.name = name
        self.partition = partition
        if persistent:
            self._store = store.BerkeleyDBStore("vc_store_" + name + ".db")
        else:
            self._store = store.DictStore()

    def __str__(self):
        return "LocalStorage(%s)" % self.name

    def get(self, key):
        return self.worker.defer(functools.partial(self._store.get, key))

    def put(self, key, value):
        return self.worker.defer(functools.partial(self._store.put, key, value))

    def multi_put(self, kvlist, resolver):
        return self.worker.defer(functools.partial(self._store.multi_put, kvlist, resolver))

    def delete(self, key):
        return self.worker.defer(functools.partial(self._store.delete, key))

    def _iterate_result(self, first, threshold, callback, result):
        kvlist, iterator = result
        if kvlist:
            callback(kvlist)
            d = self.worker.defer(functools.partial(self._store.iterate, iterator, threshold))
            d.add_callbacks(functools.partial(self._iterate_result, False, threshold, callback),
                            self._iterate_error)
        else:
            if first:
                callback(kvlist)

    def _iterate_error(self, failure):
        failure.raise_exception()

    def _iterator_ready(self, threshold, callback, iterator):
        d = self.worker.defer(functools.partial(self._store.iterate, iterator, threshold))
        d.add_callbacks(functools.partial(self._iterate_result, True, threshold, callback),
                        self._iterate_error)

    def get_all(self, threshold, callback):
        """This will call callback multiple times with a list of key/val tuples.
        The callback will be called whenever threshold bytes have been accumulated
        (and also when all key/val tuples have been gathered). If the storage
        is empty, the callback will be called with an empty list.

        This does *not* return a Deferred!
        """
        d = self.worker.defer(self._store.get_iterator)
        d.add_callbacks(functools.partial(self._iterator_ready, threshold, callback),
                        self._iterate_error)


class RemoteStorage(object):
    """A wrapper object that makes remote stores accessible just like local ones."""

    def __init__(self, address):
        self.address = address

    def __str__(self):
        return "RemoteStorage((%s, %d))" % self.address

    def _ok_get(self, result):
        if result.status == 200:
            return result.data
        else:
            raise KeyError

    def _ok(self, result):
        if result.status == 200:
            return
        else:
            raise KeyError

    def get(self, key):
        host, port = self.address
        d = tangled.client.request("http://%s:%d/_localstore/%s" % (host, port, key))
        d.add_callback(self._ok_get)
        return d

    def put(self, key, value):
        host, port = self.address
        d = tangled.client.request("http://%s:%d/_localstore/%s" % (host, port, key), "PUT", value)
        d.add_callback(self._ok)
        return d

    def delete(self, key):
        host, port = self.address
        d = tangled.client.request("http://%s:%d/_localstore/%s" % (host, port, key), "DELETE")
        d.add_callback(self._ok)
        return d


class LocalStoreHandler(object):
    """The request handler for requests to /_localstore/somekey."""

    def __init__(self, context):
        self.parent = context

    def _ok_get(self, result):
        return ts.Response(200, None, result)

    def _ok(self, result):
        return ts.Response(200)

    def _error(self, result):
        return ts.Response(404)

    def do_GET(self, request):
        key = request.groups[0]
        d = self.parent.local_get(key)
        d.add_callbacks(self._ok_get, self._error)
        return d

    def do_PUT(self, request):
        key = request.groups[0]
        d = self.parent.local_put(key, request.data)
        d.add_callbacks(self._ok, self._error)
        return d

    def do_DELETE(self, request):
        key = request.groups[0]
        d = self.parent.local_delete(key)
        d.add_callbacks(self._ok, self._error)
        return d

    do_PUSH = do_PUT


class StoreHandler(object):
    """
    The request handler for requests to /store/somekey. Implements the state
    machines for quorum reads and writes. It also handles read-repair.
    """
    W = 2
    R = 2

    def __init__(self, context):
        self.parent = context
        self.results = []
        self.failed = []

    def _encode(self, vc, value):
        return bz2.compress(pickle.dumps((vc, value)))

    def _decode(self, blob):
        return pickle.loads(bz2.decompress(blob))

    def _vc_to_context(self, vc):
        return base64.b64encode(bz2.compress(pickle.dumps(vc)))

    def _context_to_vc(self, context):
        return pickle.loads(bz2.decompress(base64.b64decode(context)))
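
The opaque context token that travels in the X-VinzClortho-Context header is nothing more than the pickled, bz2-compressed, base64-encoded vector clock. A round-trip of the two helpers above, with a plain dict standing in for vectorclock.VectorClock:

import base64
import bz2
import cPickle as pickle

vc = {("localhost", 8080): 3}  # stand-in for a vectorclock.VectorClock
token = base64.b64encode(bz2.compress(pickle.dumps(vc)))             # _vc_to_context
assert pickle.loads(bz2.decompress(base64.b64decode(token))) == vc  # _context_to_vc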

    def _extract(self, request):
        """This returns a tuple with the following:

        key
        vectorclock (or None if context not provided)
        client id (or address if not provided)
        """
        try:
            client = request.headers["X-VinzClortho-ClientId"]
        except KeyError:
            # Use the address as client id, if not provided
            client = request.client_address
        try:
            vc = self._context_to_vc(request.headers["X-VinzClortho-Context"])
        except KeyError:
            vc = None
        return request.groups[0], vc, client

    def _resolve(self):
        return vectorclock.resolve_list_extend([result for replica, result in self.results])

    def _read_repair(self, result):
        if len(self.results) + len(self.failed) == len(self.replicas):
            resolved = self._resolve()
            if resolved is None:
                # Probably no replicas
                return
            vc_final, value_final = resolved
            for replica, res in self.results:
                vc, value = res
                if vc_final.descends_from(vc) and not vc.descends_from(vc_final):
                    log.info("Read-repair needed for %s", replica)
                    d = replica.put(self.key, self._encode(vc_final, value_final))
            for replica, res in self.failed:
                log.info("Read-repair of failed node %s", replica)
                d = replica.put(self.key, self._encode(vc_final, value_final))
        return result
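
A replica is repaired when the resolved clock strictly dominates that replica's clock. The test above can be illustrated with plain dicts, assuming the conventional definition of descends_from (greater-or-equal in every component); the real vectorclock module may differ in detail:

def descends_from(a, b):
    """True if clock a (a dict of per-node counters) descends from clock b."""
    return all(a.get(node, 0) >= count for node, count in b.items())

vc_final = {"n1": 2, "n2": 1}   # resolved, most recent clock
vc_stale = {"n1": 1}            # clock read back from a lagging replica
# Strict dominance: final descends from stale, but not vice versa.
assert descends_from(vc_final, vc_stale) and not descends_from(vc_stale, vc_final)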

    def _read_quorum_achieved(self):
        return len(self.results) >= self.R

    def _write_quorum_achieved(self):
        return len(self.results) >= self.W

    def _all_received(self):
        return len(self.results) + len(self.failed) == len(self.replicas)
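
Since N=3 (VinzClortho.N) while R = W = 2, the quorums obey R + W > N: every read quorum intersects every write quorum, so a successful read sees at least one replica that took the latest successful write. A quick sanity check of the arithmetic:

N, R, W = 3, 2, 2
assert R + W > N  # every read quorum overlaps every write quorum
assert W + W > N  # any two write quorums overlap as well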

    def _respond_error(self):
        if self.response.called:
            return
        self.response.callback(ts.Response(404))

    def _respond_ok(self):
        self.response.callback(ts.Response(200))

    def _respond_get_ok(self):
        if self.response.called:
            return
        resolved = vectorclock.resolve_list([result for replica, result in self.results])
        vc, value = resolved
        context = self._vc_to_context(vc)
        code = 200
        if isinstance(value, list):
            # Unresolved siblings: let the client pick
            code = 300
        self.response.callback(ts.Response(code, {"X-VinzClortho-Context": context}, value))

    def _get_ok(self, replica, result):
        result = self._decode(result)
        # This handles deleted keys (TODO: this means concurrent deletes are lost; is this ok?)
        if result is None:
            return self._fail(replica, result)
        # There was an actual value, handle it
        self.results.append((replica, result))
        if self._read_quorum_achieved():
            self._respond_get_ok()
        elif self._all_received():
            self._respond_error()
        return result

    def _fail(self, replica, result):
        self.failed.append((replica, result))
        if self._all_received():
            self._respond_error()
        return result

    def do_GET(self, request):
        self.response = tc.Deferred()
        self.key = request.groups[0]
        self.replicas = self.parent.get_replicas(self.key)
        for r in self.replicas:
            d = r.get(self.key)
            d.add_callbacks(functools.partial(self._get_ok, r),
                            functools.partial(self._fail, r))
            d.add_both(self._read_repair)
        return self.response

    def _ok(self, replica, result):
        self.results.append((replica, result))
        if self._write_quorum_achieved():
            self._respond_ok()
        elif self._all_received():
            self._respond_error()

    def do_PUT(self, request):
        self.response = tc.Deferred()
        key, vc, client = self._extract(request)
        self.replicas = self.parent.get_replicas(key)
        vc = vc or vectorclock.VectorClock()
        vc.increment(client)
        value = self._encode(vc, request.data)
        for r in self.replicas:
            d = r.put(key, value)
            d.add_callbacks(functools.partial(self._ok, r),
                            functools.partial(self._fail, r))
        return self.response

    def do_DELETE(self, request):
        self.response = tc.Deferred()
        key, vc, client = self._extract(request)
        self.replicas = self.parent.get_replicas(key)
        vc = vc or vectorclock.VectorClock()
        vc.increment(client)
        value = self._encode(vc, None)
        for r in self.replicas:
            # A delete is handled as a put of None
            d = r.put(key, value)
            d.add_callbacks(functools.partial(self._ok, r),
                            functools.partial(self._fail, r))
        return self.response

    do_PUSH = do_PUT
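
Putting StoreHandler together: a hypothetical client session against a node running on localhost:8080, using only the standard library. The header names match what _extract parses, and the status codes (200, 300 for siblings, 404) match the responses built above.

import httplib

conn = httplib.HTTPConnection("localhost", 8080)

# Write: W=2 replicas must acknowledge before the 200 arrives.
conn.request("PUT", "/store/mykey", "myvalue",
             {"X-VinzClortho-ClientId": "client-1"})
resp = conn.getresponse()
resp.read()  # drain the body before reusing the connection
print resp.status

# Read: 200 with a single value, or 300 with a list of siblings.
conn.request("GET", "/store/mykey")
resp = conn.getresponse()
context = resp.getheader("X-VinzClortho-Context")
print resp.status, resp.read()

# Echo the context on the next write so the vector clock advances
# instead of forking into siblings.
conn.request("PUT", "/store/mykey", "newvalue",
             {"X-VinzClortho-ClientId": "client-1",
              "X-VinzClortho-Context": context})
resp = conn.getresponse()
resp.read()
print resp.status
conn.close()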


class MetaDataHandler(object):
    """The request handler for requests to /_metadata. Used when gossiping."""

    def __init__(self, context):
        self.context = context

    def do_GET(self, request):
        log.info("Metadata requested by %s", request.client_address)
        return tc.succeed(ts.Response(200, None, bz2.compress(pickle.dumps(self.context._metadata))))

    def do_PUT(self, request):
        log.info("Metadata submitted by %s", request.client_address)
        self.context.update_meta(pickle.loads(bz2.decompress(request.data)))
        return tc.succeed(ts.Response(200, None, None))


class HandoffHandler(object):
    """
    The request handler for requests to /_handoff. This is used to send
    a partition to its new owner.
    """

    def __init__(self, context):
        self.context = context

    def _put_complete(self, result):
        return ts.Response(200, None, None)

    def do_PUT(self, request):
        kvlist = pickle.loads(bz2.decompress(request.data))
        if not kvlist:
            return tc.succeed(ts.Response(200, None, None))
        d = self.context.local_multi_put(kvlist)
        d.add_both(self._put_complete)
        return d
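
The handoff payload unpacked by do_PUT above is simply a bz2-compressed pickle of (key, value) pairs; the sender (_partial_handoff, further down) builds it the same way. For reference:

import bz2
import cPickle as pickle

kvlist = [("key1", "blob1"), ("key2", "blob2")]
payload = bz2.compress(pickle.dumps(kvlist))            # what the sender posts
assert pickle.loads(bz2.decompress(payload)) == kvlist  # what do_PUT recovers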
349
350 -class AdminHandler(object):
351 """ 352 The request handler for requests to /admin. Currently, these services are available: 353 354 /admin/claim 355 356 The number of partitions claimed by a node can be read/written using this. 357 358 /admin/balance 359 360 A PUT to this will make the node try to rebalance the claim of the nodes. 361 """
362 - def __init__(self, context):
363 self.context = context
364
365 - def do_GET(self, request):
366 service = request.groups[0] 367 if service == "claim": 368 return tc.succeed(ts.Response(200, None, str(self.context.get_claim()))) 369 return tc.succeed(ts.Response(404))
370
371 - def do_PUT(self, request):
372 service = request.groups[0] 373 if service == "claim": 374 try: 375 claim = int(request.data) 376 self.context.update_claim(claim) 377 return tc.succeed(ts.Response(200)) 378 except ValueError: 379 return tc.succeed(ts.Response(400)) 380 elif service == "balance": 381 self.context.balance() 382 return tc.succeed(ts.Response(200)) 383 return tc.succeed(ts.Response(404))
384 385 do_PUSH = do_PUT
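
The admin endpoints can be exercised the same way; a hypothetical session against a node on localhost:8080:

import httplib

conn = httplib.HTTPConnection("localhost", 8080)

conn.request("GET", "/admin/claim")
resp = conn.getresponse()
print resp.read()  # number of partitions this node currently claims

conn.request("PUT", "/admin/claim", "256")
resp = conn.getresponse()
resp.read()
print resp.status  # 200, or 400 for a non-integer body

conn.request("PUT", "/admin/balance", "")
resp = conn.getresponse()
resp.read()
print resp.status  # 200; triggers a rebalance and immediate gossip
conn.close()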


class VinzClortho(object):
    """
    The main object that contains the HTTP server and handles gossiping
    of the consistent hash ring metadata.
    """
    gossip_interval = 30.0
    N = 3
    num_partitions = 1024
    worker_pool_size = 10

    def __init__(self, addr, join, claim, partitions, logfile, persistent):
        # Set up logging
        logfile = logfile or "vc_log_" + addr + ".log"
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S',
                            filename=logfile,
                            filemode='a')
        # Define a handler that writes WARNING messages or higher to sys.stderr
        console = logging.StreamHandler()
        console.setLevel(logging.WARNING)
        formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
        console.setFormatter(formatter)
        logging.getLogger('').addHandler(console)

        log.info("Starting VinzClortho")

        self.reactor = tc.Reactor()
        self.workers = [tc.Worker(self.reactor, True) for i in range(self.worker_pool_size)]
        self.address = split_str_addr(addr)
        self.host, self.port = self.address
        self.num_partitions = partitions or self.num_partitions
        self.persistent = persistent
        self._vcid = self.address
        self._storage = {}
        self._pending_shutdown_storage = {}
        self._metadata = None
        self._node = chash.Node(self.host, self.port)
        self._claim = claim
        self.create_ring(join)
        self._server = ts.AsyncHTTPServer(self.address, self,
                                          [(r"/store/(.*)", StoreHandler),
                                           (r"/_localstore/(.*)", LocalStoreHandler),
                                           (r"/_handoff", HandoffHandler),
                                           (r"/_metadata", MetaDataHandler),
                                           (r"/admin/(.*)", AdminHandler)])
        self.reactor.call_later(self.check_shutdown, 30.0)

    @property
    def ring(self):
        return self._metadata[1]["ring"]

    def _get_worker(self, num):
        # The idea is to get the same worker for a partition, to avoid
        # threading issues
        return self.workers[num % len(self.workers)]

    def _get_replica(self, node, key):
        if node.host == self.host and node.port == self.port:
            return self.get_storage(key)
        else:
            return RemoteStorage((node.host, node.port))

    def check_shutdown(self):
        if not self._storage and not self._pending_shutdown_storage:
            for w in self.workers:
                w.stop()
                w.join()
            # fugly way, but it works
            sys.exit(0)
        self.reactor.call_later(self.check_shutdown, 5.0)

    def get_claim(self):
        return len(self._node.claim)

    def balance(self):
        self.ring.update_claim()
        self._metadata[0].increment(self._vcid)
        self.reactor.call_later(self.update_storage, 0.0)
        self.reactor.call_later(self.check_handoff, 0.0)
        self.schedule_gossip(0.0)

    def update_claim(self, claim):
        force = (claim == 0)
        self.ring.update_node(self._node, claim, force)
        self.balance()

    def get_replicas(self, key):
        preferred, fallbacks = self.ring.preferred(key)
        return [self._get_replica(n, key) for n in preferred]

    def get_storage(self, key):
        p = self.ring.key_to_partition(key)
        return self._storage.setdefault(p, LocalStorage(self._get_worker(p),
                                                        "%d@%s:%d" % (p, self.host, self.port),
                                                        p, self.persistent))

    def local_get(self, key):
        s = self.get_storage(key)
        return s.get(key)

    def local_put(self, key, value):
        s = self.get_storage(key)
        return s.put(key, value)

    def local_multi_put(self, kvlist):
        def resolve(a, b):
            return vectorclock.resolve_list_extend([a, b])
        s = self.get_storage(kvlist[0][0])
        return s.multi_put(kvlist, resolve)

    def local_delete(self, key):
        s = self.get_storage(key)
        return s.delete(key)

    def create_ring(self, join):
        if join:
            self.get_gossip(split_str_addr(join))
        else:
            vc = vectorclock.VectorClock()
            vc.increment(self._vcid)
            self._metadata = (vc, {"ring": chash.Ring(self.num_partitions, self._node, self.N)})
            self.reactor.call_later(self.update_storage, 0.0)
            self.schedule_gossip()

    def gossip_received(self, address, response):
        meta = pickle.loads(bz2.decompress(response.data))
        log.info("Gossip received from %s", address)
        if self.update_meta(meta):
            log.info("Update gossip @ %s", address)
            url = "http://%s:%d/_metadata" % address
            d = tangled.client.request(url, command="PUT", data=bz2.compress(pickle.dumps(self._metadata)))
            d.add_both(self.gossip_sent)
        else:
            self.schedule_gossip()

    def update_meta(self, meta):
        old = False
        updated = False

        # Update metadata as needed
        if self._metadata is None:
            self._metadata = meta
            updated = True
        else:
            vc_new, meta_new = meta
            vc_curr, meta_curr = self._metadata
            if vc_new.descends_from(vc_curr):
                if vc_new != vc_curr:
                    log.debug("Received metadata is new %s", meta)
                    # Accept the new metadata
                    self._metadata = meta
                    updated = True
                else:
                    log.debug("Received metadata is the same")
            else:
                log.debug("Received metadata is old")
                old = True
                # Reconcile?

        # Add myself if needed
        # (this just compares host and port, not the claim)
        if self._node not in self.ring.nodes:
            self.ring.add_node(self._node, self._claim)
            self._metadata[0].increment(self._vcid)
            updated = True
            old = True

        if updated:
            # Grab the node since it might have been updated
            self._node = self.ring.get_node(self._node.name)
            self.reactor.call_later(self.update_storage, 0.0)
            self.reactor.call_later(self.check_handoff, 0.0)
        return old

    def random_other_node_address(self):
        other = [n for n in self.ring.nodes if n != self._node]
        if len(other) == 0:
            return None
        n = other[random.randint(0, len(other) - 1)]
        return n.host, n.port

    def schedule_gossip(self, timeout=None):
        log.debug("Gossip scheduled, %s", timeout)
        if timeout is None:
            timeout = self.gossip_interval
        self.reactor.call_later(self.get_gossip, timeout)

    def gossip_sent(self, result):
        log.debug("Gossip sent")
        self.schedule_gossip()

    def gossip_error(self, result):
        log.error("Gossip error: %s", result)
        self.schedule_gossip()

    def get_gossip(self, a=None):
        address = a or self.random_other_node_address()
        if address is not None:
            log.debug("Gossip with %s", address)
            d = tangled.client.request("http://%s:%d/_metadata" % address)
            d.add_callbacks(functools.partial(self.gossip_received, address), self.gossip_error)
            return d

    def update_storage(self):
        """Creates storages (if necessary) for all claimed partitions"""
        for p in self._node.claim:
            if p not in self._storage:
                self._storage[p] = LocalStorage(self._get_worker(p),
                                                "%d@%s:%d" % (p, self.host, self.port),
                                                p, self.persistent)

    def _partial_handoff(self, node, partition, kvlist):
        if not kvlist:
            # TODO: remove the db file etc.
            log.debug("Shutdown partition %s", partition)
            del self._pending_shutdown_storage[partition]
        else:
            url = "http://%s:%d/_handoff" % (node.host, node.port)
            d = tangled.client.request(url, command="PUT", data=bz2.compress(pickle.dumps(kvlist)))
            log.info("Handoff %d items from %s to %s", len(kvlist), partition, node)

    def do_handoff(self, node, partitions, result):
        log.debug("Handing off %s", partitions)
        for p in partitions:
            s = self._storage[p]
            self._pending_shutdown_storage[p] = s
            del self._storage[p]
            # Send the partitions in 1MB chunks
            s.get_all(1048576, functools.partial(self._partial_handoff, node, p))
        return result

    def _handoff_error(self, failure):
        failure.raise_exception()

    def check_handoff(self):
        """Checks if any partitions that aren't claimed or replicated can be handed off"""
        to_handoff = set(self._storage.keys()) - set(self._node.claim) - self.ring.replicated(self._node)
        handoff_per_node = collections.defaultdict(list)
        for p in to_handoff:
            n = self.ring.partition_to_node(p)
            handoff_per_node[n].append(p)
        for n, plist in handoff_per_node.items():
            # Request the metadata to see if the node is alive
            d = tangled.client.request("http://%s:%d/_metadata" % (n.host, n.port))
            d.add_callbacks(functools.partial(self.do_handoff, n, plist), self._handoff_error)

    def run(self):
        self.reactor.loop()


def split_str_addr(str_addr):
    addr = str_addr.split(":")
    host = addr[0]
    try:
        port = int(addr[1])
    except IndexError:
        port = 80
    return host, port
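
For reference, the parser accepts both the host:port and bare-host forms, with the port defaulting to 80:

assert split_str_addr("localhost:8080") == ("localhost", 8080)
assert split_str_addr("example.com") == ("example.com", 80)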


def main():
    parser = optparse.OptionParser()
    parser.add_option("-a", "--address", dest="address", default="localhost:8080",
                      help="Bind to ADDRESS", metavar="ADDRESS")
    parser.add_option("-j", "--join", dest="join",
                      help="Join the ring via the node at ADDRESS", metavar="ADDRESS")
    parser.add_option("-c", "--claim", dest="claim", type="int",
                      help="Number of partitions to claim")
    parser.add_option("-p", "--partitions", dest="partitions", type="int",
                      help="Number of partitions in the hash ring")
    parser.add_option("-l", "--logfile", dest="logfile", metavar="FILE",
                      help="Use FILE as logfile")
    (options, args) = parser.parse_args()

    vc = VinzClortho(options.address, options.join, options.claim,
                     options.partitions, options.logfile, True)
    vc.run()


if __name__ == '__main__':
    main()