import functools
import cPickle as pickle
import base64
import bz2
import optparse
import random
import platform
import collections
import sys
import store
import tangled.core as tc
import tangled.client
import tangled.server as ts
import vectorclock
import consistenthashing as chash

import logging
log = logging.getLogger("vinzclortho.core")

class InvalidContext(Exception):
    pass

30 """
31 A wrapper that makes calls to a L{store.Store} be executed by a worker, and return L{tangled.core.Deferred}'s
32 """
    def __init__(self, worker, name, partition, persistent):
        self.worker = worker
        self.name = name
        self.partition = partition
        # Assumption: store.Store takes the storage name and a persistence flag.
        self._store = store.Store(name, persistent)

43 return "LocalStorage(%s)"%self.name
44
46 return self.worker.defer(functools.partial(self._store.get, key))
47
48 - def put(self, key, value):
49 return self.worker.defer(functools.partial(self._store.put, key, value))
50
52 return self.worker.defer(functools.partial(self._store.multi_put, kvlist, resolver))
53
55 return self.worker.defer(functools.partial(self._store.delete, key))
56
    def _iterator_ready(self, threshold, callback, iterator):
        # Sketch of an elided helper, assuming the store iterator yields
        # (key, value) tuples: batches are handed to the callback once they
        # reach roughly `threshold` bytes, and a final empty batch signals
        # that everything has been delivered.
        batch, size = [], 0
        for key, value in iterator:
            batch.append((key, value))
            size += len(value)
            if size >= threshold:
                callback(batch)
                batch, size = [], 0
        if batch:
            callback(batch)
        callback([])

    def _iterate_error(self, result):
        # Sketch of an elided helper: just log iteration failures.
        log.error("Failed to iterate over %s: %s", self.name, result)

    def get_all(self, threshold, callback):
        """This will call callback multiple times with a list of key/val tuples.
        The callback will be called whenever `threshold` bytes have been
        accumulated (and also when all key/val tuples have been gathered). If
        the storage is empty, the callback will be called with an empty list.

        This does *not* return a Deferred!
        """
        d = self.worker.defer(self._store.get_iterator)
        d.add_callbacks(functools.partial(self._iterator_ready, threshold, callback), self._iterate_error)

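# Example (illustrative only; `storage` and `on_batch` are hypothetical names):
# draining a LocalStorage in roughly 1 MiB batches, the same pattern the
# handoff code further below uses.
#
#   batches = []
#   def on_batch(kvlist):
#       batches.append(kvlist)
#   storage.get_all(1048576, on_batch)
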
87 """A wrapper object that makes remote stores accessible just like local ones"""
89 self.address = address
90
92 return "RemoteStorage((%s, %d))"%self.address
93
    def _get_ok(self, result):  # name assumed
        if result.status == 200:
            return result.data
        else:
            raise KeyError

    def _ok(self, result):
        if result.status == 200:
            return
        else:
            raise KeyError

    def get(self, key):
        # Sketch of the elided body, assuming remote values are fetched via
        # the owning node's /_localstore resource and that Deferreds support
        # add_callback.
        d = tangled.client.request("http://%s:%d/_localstore/%s"%(self.address[0], self.address[1], key))
        d.add_callback(self._get_ok)
        return d

    def put(self, key, value):
        # Sketch of the elided body (same assumptions as get above).
        d = tangled.client.request("http://%s:%d/_localstore/%s"%(self.address[0], self.address[1], key),
                                   command="PUT", data=value)
        d.add_callback(self._ok)
        return d

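# RemoteStorage deliberately mirrors LocalStorage's Deferred-based get/put
# interface, so callers can treat any replica uniformly. Illustrative only
# (`node` stands for some chash.Node other than the local one):
#
#   replica = RemoteStorage((node.host, node.port))
#   d = replica.get("somekey")
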
127 """The request handler for requests to /_localstore/somekey"""
129 self.parent = context
130
132 return ts.Response(200, None, result)
133
    def _ok(self, result):
        # Body assumed: acknowledge a successful local write.
        return ts.Response(200)

    def _error(self, result):
        # Sketch of an elided helper: map failures to 404.
        return ts.Response(404)

    def do_GET(self, request):
        # Sketch of the elided handler: read the key from the local store.
        key = request.groups[0]
        d = self.parent.get_storage(key).get(key)
        d.add_callbacks(self._get_ok, self._error)
        return d

    def do_PUT(self, request):
        # Sketch of the elided handler: write the request body (assumed to be
        # available as request.data) to the local store.
        key = request.groups[0]
        d = self.parent.get_storage(key).put(key, request.data)
        d.add_callbacks(self._ok, self._error)
        return d

    do_PUSH = do_PUT

162 """
163 The request handler for requests to /store/somekey. Implements the state
164 machines for quorum reads and writes. It also handles read-repair.
165 """
166 W = 2
167 R = 2
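    # With N = 3 replicas (VinzClortho.N) and W = R = 2, W + R > N, so any
    # read quorum overlaps the most recent write quorum; this is the usual
    # Dynamo-style reasoning behind these two constants.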
    def __init__(self, context):  # constructor signature assumed
        self.parent = context
        self.results = []
        self.failed = []

    def _encode(self, vc, value):
        return bz2.compress(pickle.dumps((vc, value)))

    def _decode(self, blob):  # name assumed
        return pickle.loads(bz2.decompress(blob))

    def _vc_to_context(self, vc):
        return base64.b64encode(bz2.compress(pickle.dumps(vc)))

    def _context_to_vc(self, context):
        return pickle.loads(bz2.decompress(base64.b64decode(context)))

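    # The opaque context handed to clients is just the vector clock, pickled,
    # bz2-compressed and base64-encoded so it can ride along in an HTTP
    # header. Illustrative round trip (`vc` is a hypothetical clock):
    #
    #   context = self._vc_to_context(vc)
    #   same_vc = self._context_to_vc(context)
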
186 """This returns a tuple with the following:
187
188 key
189 vectorclock (or None if context not provided)
190 client id (or address if not provided)
191 """
192 try:
193 client = request.headers["X-VinzClortho-ClientId"]
194 except KeyError:
195
196 client = request.client_address
197 try:
198 vc = self._context_to_vc(request.headers["X-VinzClortho-Context"])
199 except KeyError:
200 vc = None
201 return request.groups[0], vc, client
202
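    # Illustrative client usage of the headers parsed above (address and
    # values are made up):
    #
    #   curl -X PUT http://localhost:8080/store/somekey \
    #        -H "X-VinzClortho-ClientId: client-1" \
    #        -H "X-VinzClortho-Context: <context returned by an earlier GET>" \
    #        -d "somevalue"
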
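    # Read-repair: once every replica has answered, any replica whose clock is
    # strictly older than the resolved one (the resolved clock descends from
    # it, but not vice versa) is rewritten with the winning value, as are the
    # replicas that failed outright.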
    def _read_repair(self, result):  # name assumed
        if len(self.results) + len(self.failed) == len(self.replicas):
            resolved = self._resolve()
            if resolved is None:
                return
            vc_final, value_final = resolved
            for replica, result in self.results:
                vc, value = result
                if vc_final.descends_from(vc) and not vc.descends_from(vc_final):
                    log.info("Read-repair needed for %s", replica)
                    d = replica.put(self.key, self._encode(vc_final, value_final))
            for replica, result in self.failed:
                log.info("Read-repair of failed node %s", replica)
                d = replica.put(self.key, self._encode(vc_final, value_final))
        return result

    def _read_quorum(self):  # name assumed
        return len(self.results) >= self.R

    def _write_quorum(self):  # name assumed
        return len(self.results) >= self.W

    def _all_answered(self):  # name assumed
        return len(self.results) + len(self.failed) == len(self.replicas)

    def _respond_not_found(self):  # name assumed
        if self.response.called:
            return
        self.response.callback(ts.Response(404))

    def _respond_get(self):  # name assumed
        if self.response.called:
            return
        resolved = vectorclock.resolve_list([result for replica, result in self.results])
        vc, value = resolved
        context = self._vc_to_context(vc)
        code = 200
        if isinstance(value, list):
            # concurrent siblings could not be resolved; report 300 Multiple Choices
            code = 300
        self.response.callback(ts.Response(code, {"X-VinzClortho-Context": context}, value))

    def _get_ok(self, replica, result):
        # Sketch of the elided body: record the decoded reply, answer as soon
        # as a read quorum is reached, and give up with a 404 if every replica
        # has answered without reaching one.
        self.results.append((replica, self._decode(result)))
        if self._read_quorum():
            self._respond_get()
        elif self._all_answered():
            self._respond_not_found()

    def _fail(self, replica, result):
        # Sketch of the elided body: remember the failure and fall back to a
        # 404 once every replica has answered without a quorum.
        self.failed.append((replica, result))
        if self._all_answered() and not self._read_quorum():
            self._respond_not_found()

    def _ok(self, replica, result):
        # Sketch of the elided body: count the successful write and answer the
        # client once a write quorum is reached.
        self.results.append((replica, result))
        if self._write_quorum() and not self.response.called:
            self.response.callback(ts.Response(200))

    do_PUSH = do_PUT

332 """
333 The request handler for requests to /_handoff. This is used to send
334 a partition to its new owner.
335 """
337 self.context = context
338
341
349
351 """
352 The request handler for requests to /admin. Currently, these services are available:
353
354 /admin/claim
355
356 The number of partitions claimed by a node can be read/written using this.
357
358 /admin/balance
359
360 A PUT to this will make the node try to rebalance the claim of the nodes.
361 """
363 self.context = context
364
370
384
385 do_PUSH = do_PUT
386
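# Illustrative admin usage (address and values are made up; the request body
# format for /admin/claim is an assumption):
#
#   curl http://localhost:8080/admin/claim
#   curl -X PUT -d "4" http://localhost:8080/admin/claim
#   curl -X PUT http://localhost:8080/admin/balance
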
388 """
389 The main object that contains the HTTP server and handles gossiping
390 of the consistent hash ring metadata.
391 """
392 gossip_interval=30.0
393 N=3
394 num_partitions=1024
395 worker_pool_size=10
    def __init__(self, addr, join, claim, partitions, logfile, persistent):
        logfile = logfile or "vc_log_" + addr + ".log"
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S',
                            filename=logfile,
                            filemode='a')

        console = logging.StreamHandler()
        console.setLevel(logging.WARNING)
        formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
        console.setFormatter(formatter)
        logging.getLogger('').addHandler(console)

        log.info("Starting VinzClortho")

        self.reactor = tc.Reactor()
        self.workers = [tc.Worker(self.reactor, True) for i in range(self.worker_pool_size)]
        self.address = split_str_addr(addr)
        self.host, self.port = self.address
        self.num_partitions = partitions or self.num_partitions
        self.persistent = persistent
        self._vcid = self.address
        self._storage = {}
        self._pending_shutdown_storage = {}
        self._metadata = None
        self._node = chash.Node(self.host, self.port)
        self._claim = claim
        self.create_ring(join)
        self._server = ts.AsyncHTTPServer(self.address, self,
                                          [(r"/store/(.*)", StoreHandler),
                                           (r"/_localstore/(.*)", LocalStoreHandler),
                                           (r"/_handoff", HandoffHandler),
                                           (r"/_metadata", MetaDataHandler),
                                           (r"/admin/(.*)", AdminHandler)])
        self.reactor.call_later(self.check_shutdown, 30.0)

    @property
    def ring(self):
        return self._metadata[1]["ring"]

    def _get_worker(self, num):
        return self.workers[num % len(self.workers)]

    def get_node_storage(self, node, key):  # name assumed
        if node.host == self.host and node.port == self.port:
            return self.get_storage(key)
        else:
            return RemoteStorage((node.host, node.port))

    def check_shutdown(self):
        if not self._storage and not self._pending_shutdown_storage:
            for w in self.workers:
                w.stop()
                w.join()
            sys.exit(0)
        self.reactor.call_later(self.check_shutdown, 5.0)

    def get_claim(self):  # name assumed
        return len(self._node.claim)

    def multi_put(self, kvlist, resolve):  # name and signature assumed
        s = self.get_storage(kvlist[0][0])
        return s.multi_put(kvlist, resolve)

    def gossip_received(self, address, response):  # name and signature assumed
        meta = pickle.loads(bz2.decompress(response.data))
        log.info("Gossip received from %s", address)
        if self.update_meta(meta):
            log.info("Update gossip @ %s", address)
            url = "http://%s:%d/_metadata"%address
            d = tangled.client.request(url, command="PUT", data=bz2.compress(pickle.dumps(self._metadata)))
            d.add_both(self.gossip_sent)
        else:
            self.schedule_gossip()

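    # Gossip flow, as far as it is visible here: a node periodically fetches
    # /_metadata from a randomly chosen peer (see random_other_node below),
    # merges it into its own view, and PUTs its metadata back whenever the
    # merge shows the peer was behind; otherwise it simply schedules the next
    # gossip round.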

    def random_other_node(self):  # name assumed
        other = [n for n in self.ring.nodes if n != self._node]
        if len(other) == 0:
            return None
        n = other[random.randint(0, len(other)-1)]
        return n.host, n.port

    def create_storages(self):  # name assumed
        """Creates storages (if necessary) for all claimed partitions"""
        for p in self._node.claim:
            if p not in self._storage:
                self._storage[p] = LocalStorage(self._get_worker(p), "%d@%s:%d"%(p, self.host, self.port), p, self.persistent)

    def _partial_handoff(self, node, partition, kvlist):
        if not kvlist:
            log.debug("Shutdown partition %s", partition)
            del self._pending_shutdown_storage[partition]
        else:
            url = "http://%s:%d/_handoff"%(node.host, node.port)
            d = tangled.client.request(url, command="PUT", data=bz2.compress(pickle.dumps(kvlist)))
            log.info("Handoff %d items from %s to %s", len(kvlist), partition, node)

    def do_handoff(self, node, partitions, result):
        log.debug("Handing off %s", partitions)
        for p in partitions:
            s = self._storage[p]
            self._pending_shutdown_storage[p] = s
            del self._storage[p]
            s.get_all(1048576, functools.partial(self._partial_handoff, node, p))
        return result

    def _handoff_error(self, result):
        # Sketch of an elided helper: just log the failed handoff attempt.
        log.error("Handoff failed: %s", result)

618 """Checks if any partitions that aren't claimed or replicated can be handed off"""
619 to_handoff = set(self._storage.keys()) - set(self._node.claim) - self.ring.replicated(self._node)
620 handoff_per_node = collections.defaultdict(list)
621 for p in to_handoff:
622 n = self.ring.partition_to_node(p)
623 handoff_per_node[n].append(p)
624 for n, plist in handoff_per_node.items():
625
626 d = tangled.client.request("http://%s:%d/_metadata"%(n.host, n.port))
627 d.add_callbacks(functools.partial(self.do_handoff, n, plist), self._handoff_error)
628
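    # Handoff, as far as it is visible here: partitions this node no longer
    # claims (and does not replicate) are grouped by their new owner, moved to
    # _pending_shutdown_storage, and streamed to the owner's /_handoff
    # resource in roughly 1 MiB chunks; an empty final chunk retires the
    # pending partition, which eventually lets check_shutdown stop the node.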

def split_str_addr(str_addr):
    addr = str_addr.split(":")
    host = addr[0]
    try:
        port = int(addr[1])
    except IndexError:
        port = 80
    return host, port
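# Illustrative behaviour of split_str_addr:
#
#   split_str_addr("localhost:8080")  -> ("localhost", 8080)
#   split_str_addr("example.com")     -> ("example.com", 80)
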

def main():
    parser = optparse.OptionParser()
    parser.add_option("-a", "--address", dest="address", default="localhost:8080",
                      help="Bind to ADDRESS", metavar="ADDRESS")
    parser.add_option("-j", "--join", dest="join",
                      help="Join the cluster via the node at ADDRESS", metavar="ADDRESS")
    parser.add_option("-c", "--claim", dest="claim",
                      help="Number of partitions to claim")
    parser.add_option("-p", "--partitions", dest="partitions", type="int",
                      help="Number of partitions in the hash ring")
    parser.add_option("-l", "--logfile", dest="logfile", metavar="FILE",
                      help="Use FILE as logfile")
    (options, args) = parser.parse_args()

    vc = VinzClortho(options.address, options.join, options.claim, options.partitions, options.logfile, True)
    vc.run()

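# Illustrative invocation (module filename assumed; a second node joins the
# first one):
#
#   python vinzclortho.py -a localhost:8080
#   python vinzclortho.py -a localhost:8081 -j localhost:8080
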
if __name__ == '__main__':
    main()