aboutsummaryrefslogtreecommitdiff
path: root/rumba/storyboard.py
blob: b6e07d6ca8019f56512af465ee1475b4b6b101f2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
#
# A library to manage ARCFIRE experiments
#
#    Copyright (C) 2017 Nextworks S.r.l.
#    Copyright (C) 2017 imec
#
#    Sander Vrijders   <sander.vrijders@ugent.be>
#    Dimitri Staessens <dimitri.staessens@ugent.be>
#    Vincenzo Maffione <v.maffione@nextworks.it>
#    Marco Capitani    <m.capitani@nextworks.it>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., http://www.fsf.org/about/contact/.
#

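# Base class for client apps (documents the Client class defined below)
#
# @ap: Application Process binary
# @nodes: A Node, or a list of Nodes, on which the client may be started
# @options: Options to pass to the binary
# @shutdown: Command used to stop the client; "<pid>" is replaced by its pid
#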
import os
import random
import time

import rumba.model as model
import rumba.ssh_support as ssh_support
import rumba.log as log

# Module-level logger, obtained through rumba's own log module.
logger = log.get_logger(__name__)

# Prefer numpy's samplers when numpy can be imported; otherwise fall back to
# rumba's own poisson implementation and a stdlib-based exponential sampler.
try:
    from numpy.random import poisson
    from numpy.random import exponential
    logger.debug("Using numpy for faster and better random variables.")
except ImportError:
    from rumba.recpoisson import poisson

    def exponential(mean_duration):
        """Return a sample from an exponential distribution with the
        given mean."""
        # random.expovariate expects the rate (1 / mean), whereas the numpy
        # function this replaces is called with the mean itself.
        return random.expovariate(1.0 / mean_duration)

    logger.debug("Falling back to simple implementations.")
    # PROBLEM! These debug messages are emitted at import time and will
    # almost never be printed (presumably because logging is configured
    # only later -- TODO confirm). But we might not care.

# Last identifier handed out by get_id(); -1 means none was issued yet.
current_id = -1


def get_id():
    """Return the next identifier of the sequence 0, 1, 2, ...

    Every call advances the module-level counter by one.
    """
    global current_id
    next_id = current_id + 1
    current_id = next_id
    return next_id


class Client(object):
    """Description of a client application and of the nodes it may run on.

    @ap: Application Process binary
    @nodes: A single Node, a list of Nodes, or None (no nodes yet)
    @options: Options to pass to the binary
    @shutdown: Command used to stop a process; it is handed unchanged to
               every ClientProcess created by process()
    """

    def __init__(self, ap, nodes=None, options=None, shutdown="kill <pid>"):
        self.ap = ap
        # The startup command is the binary followed by its options, if any.
        suffix = "" if options is None else " " + options
        self.startup = ap + suffix
        # None becomes an empty list and a bare Node is wrapped in a list;
        # anything else is stored as given.
        if nodes is None:
            nodes = []
        elif isinstance(nodes, model.Node):
            nodes = [nodes]
        self.nodes = nodes
        self.shutdown = shutdown

    def add_node(self, node):
        """Add a node to the ones this client may run on.

        Raises Exception if the argument is not a Node.
        """
        if isinstance(node, model.Node):
            self.nodes.append(node)
            return
        raise Exception("A Node is required.")

    def process(self, duration):
        """Return a new ClientProcess of this client lasting @duration.

        The process is assigned a node picked at random among self.nodes,
        or None when there are no nodes.
        """
        if len(self.nodes) > 0:
            target = random.choice(self.nodes)
        else:
            target = None
        process_id = get_id()
        return ClientProcess(process_id, self.ap, self.startup,
                             duration, target, self.shutdown)


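# Base class for client processes
#
# @client_id: Identifier appended to the binary name when starting the process
# @ap: Application Process binary
# @startup: Command line used to start the process; "<duration>" is
#           replaced by the duration
# @duration: The time (in seconds) this process should run
# @node: The node on which this process is started
# @shutdown: Command used to stop the process; "<pid>" is replaced by its pid
#
# The start time is not a parameter: it is recorded when run() is called.
#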
class ClientProcess(object):
    """A single run of a client application on a node.

    @client_id: Identifier appended to the binary name when the process
                is started
    @ap: Application Process binary
    @startup: Command line used to start the process; every "<duration>"
              in it is replaced by the duration when the process is run
    @duration: The time (in seconds) this process should run
    @node: Node on which to run the process (may also be passed to run())
    @shutdown: Command used to stop the process; every "<pid>" in it is
               replaced by the pid of the process. With an empty string
               no command is issued when the process is stopped.
    """

    def __init__(self, client_id, ap, startup, duration,
                 node=None, shutdown="kill <pid>"):
        # NOTE: the default used to read "<kill <pid>", which expands to a
        # shell input redirection instead of a kill command.
        self.id = client_id
        self.ap = ap
        self.startup = startup
        self.duration = duration
        self.start_time = None
        self.running = False
        self.node = node
        self.pid = None
        self.shutdown = shutdown

    def run(self, node=None):
        """Start the process on @node, or on the node set at construction.

        Records the start time and stores in self.pid whatever the node's
        execute_command returns for the start command (presumably the pid
        echoed by startup.sh -- confirm against the Node implementation).
        If the command fails with an SSHException the failure is logged
        and the process is left marked as not running.

        Raises Exception if no node is available.
        """
        if node is not None:
            self.node = node
        if self.node is None:
            raise Exception('No node specified for client %s' % (self.ap,))
        self.start_time = time.time()

        logger.debug(
            'Starting client app %s on node %s with duration %s.',
            self.ap, self.node.name, self.duration
        )

        # startup.sh takes a label as first argument, then the command.
        start_cmd = "./startup.sh %s_%s %s" % (
            self.ap,
            self.id,
            self.startup.replace("<duration>", str(self.duration)),
        )
        self.running = True
        try:
            self.pid = self.node.execute_command(start_cmd)
        except ssh_support.SSHException:
            # Nothing was started: do not pretend the process is running,
            # otherwise check() would later try to kill a missing pid.
            self.running = False
            self.pid = None
            logger.warning('Could not start client %s on node %s.',
                           self.ap, self.node.name)
            return
        logger.debug('Client app %s on node %s got pid %s.',
                     self.ap, self.node.name, self.pid)

    def stop(self):
        """Issue the shutdown command for this process, if there is one.

        SSH failures are logged and otherwise ignored.
        """
        if self.shutdown != "":
            if self.pid is None and '<pid>' in self.shutdown:
                # The command needs a pid we never obtained; running it
                # would only produce a bogus "kill None".
                logger.debug(
                    'Client %s on node %s has no pid, nothing to kill.',
                    self.ap, self.node.name
                )
                return
            logger.debug(
                'Killing client %s on node %s.',
                self.ap, self.node.name
            )
            try:
                kill_cmd = self.shutdown.replace('<pid>', str(self.pid))
                self.node.execute_command(kill_cmd)
            except ssh_support.SSHException:
                logger.warning('Could not kill client %s on node %s.',
                               self.ap, self.node.name)
        else:
            logger.debug(
                'Client %s on node %s has terminated.',
                self.ap, self.node.name
            )

    def check(self):
        """Check if the process should keep running, stop it if not,
        and return true if and only if it is still running."""
        now = time.time()
        if not self.running:
            return False
        if now - self.start_time >= self.duration:
            self.stop()
            self.running = False
            return False
        return True


# Base class for server programs
#
# @ap: Application Process binary
# @arrival_rate: Average requests/s to be received by this server
# @mean_duration: Average duration of a client connection (in seconds)
# @options: Options to pass to the binary
# @max_clients: Maximum number of new clients created per call to
#               get_new_clients
# @clients: Client binaries that will use this server
# @nodes: Specific nodes to start this server on
# @min_duration: Minimum duration of a client connection (in seconds)
#
class Server(object):
    """A server application, its clients and the nodes it runs on.

    @ap: Application Process binary
    @arrival_rate: Average requests/s to be received by this server
    @mean_duration: Average duration of a client connection (in seconds)
    @options: Options to pass to the binary
    @max_clients: Upper bound on the number of new clients returned by a
                  single call to get_new_clients
    @clients: Client binaries that will use this server
    @nodes: Specific nodes to start this server on
    @min_duration: Minimum duration of a client connection (in seconds)
    """

    # Launcher script written on server nodes. It has its own name so that
    # it cannot overwrite the "startup.sh" that client processes invoke
    # with a different argument convention when a node hosts both roles.
    STARTUP_SCRIPT = "server_startup.sh"

    def __init__(self, ap, arrival_rate, mean_duration,
                 options=None, max_clients=float('inf'),
                 clients=None, nodes=None, min_duration=2):
        self.ap = ap
        self.options = options if options is not None else ""
        self.max_clients = max_clients
        if clients is None:
            clients = list()
        self.clients = clients
        if nodes is None:
            nodes = []
        self.nodes = nodes
        self.arrival_rate = arrival_rate  # mean requests/s
        # Mean (in seconds) of the exponential part of a client duration:
        # min_duration is added back to every sample, so it is subtracted
        # here, with a floor of 0.1 to keep the mean positive.
        self.actual_parameter = max(mean_duration - min_duration, 0.1)
        # Maps each node to the value returned when starting the server
        # on it (used as the pid by stop()).
        self.pids = {}
        self.min_duration = min_duration

    def add_client(self, client):
        """Add a client to the ones that use this server."""
        self.clients.append(client)

    def del_client(self, client):
        """Remove a client from the ones that use this server."""
        self.clients.remove(client)

    def add_node(self, node):
        """Add a node to the ones this server is started on."""
        self.nodes.append(node)

    def del_node(self, node):
        """Remove a node from the ones this server is started on."""
        self.nodes.remove(node)

    def get_new_clients(self, interval):
        """
        Returns a list of clients of size appropriate to the server's rate.

        The list's size should be a sample from Poisson(arrival_rate) over
        interval seconds.
        Hence, the average size should be interval * arrival_rate.
        The size never exceeds max_clients.
        """
        number = poisson(self.arrival_rate * interval)
        number = int(min(number, self.max_clients))
        return [self.make_client_process() for _ in range(number)]

    def make_client_process(self):
        """Returns a client of this server

        The client is picked at random and its duration is min_duration
        plus an exponential sample, rounded to two decimals.
        Raises Exception if the server has no clients.
        """
        if len(self.clients) == 0:
            raise Exception("Server %s has empty client list." % (self.ap,))
        duration = exponential(self.actual_parameter) + self.min_duration
        return random.choice(self.clients).process(
            duration=float("%.2f" % (duration,))
        )

    def run(self):
        """Start the server on each of its nodes.

        A launcher script is written on the node and then used to start
        the binary in the background with its output redirected to
        "<ap>_server.log". The value returned by the launch command is
        stored in self.pids. SSH failures are logged and the remaining
        nodes are still processed.
        """
        # The commands do not depend on the node, so build them once.
        logfile = "%s_server.log" % self.ap
        script = r'nohup "$@" > %s 2>&1 & echo "$!"' % (logfile,)
        # options is "" when none were given; avoid a trailing space.
        run_cmd = self.ap + ((" " + self.options) if self.options else "")
        cmd_1 = "echo '%s' > %s && chmod a+x %s" % (
            script, self.STARTUP_SCRIPT, self.STARTUP_SCRIPT
        )
        cmd_2 = "./%s %s" % (self.STARTUP_SCRIPT, run_cmd)
        for node in self.nodes:
            logger.debug(
                'Starting server %s on node %s with logfile %s.',
                self.ap, node.name, logfile
            )
            try:
                node.execute_command(cmd_1)
                self.pids[node] = node.execute_command(cmd_2)
            except ssh_support.SSHException:
                logger.warning('Could not start server %s on node %s.',
                               self.ap, node.name)

    def stop(self):
        """Kill the server on every node on which it was started.

        SSH failures are logged and otherwise ignored.
        """
        for node, pid in self.pids.items():
            logger.debug(
                'Killing server %s on node %s.',
                self.ap, node.name
            )
            try:
                node.execute_command("kill %s" % pid)
            except ssh_support.SSHException:
                logger.warning('Could not kill server %s on node %s.',
                               self.ap, node.name)


# Base class for ARCFIRE storyboards
#
# @duration: Duration of the whole storyboard (in seconds)
# @experiment: Experiment to use as input
# @servers: App servers available in the network.
#           A list whose items are either a Server or a
#           (Server, Node) couple
#
class StoryBoard(object):
    """Runs servers and randomly arriving clients for a fixed duration.

    @duration: Duration of the whole storyboard (in seconds)
    @experiment: Experiment to use as input
    @servers: App servers available in the network; each item is either
              a Server or a (Server, Node) couple
    """

    DEFAULT_INTERVAL = 2.5  # in seconds (may be a float)

    def __init__(self, duration, experiment=None, servers=None):
        self.experiment = experiment
        self.duration = duration
        self.servers = list()
        if servers is None:
            servers = list()
        for s in servers:
            self._validate_and_add_server(s)
        self.active_clients = []
        self.start_time = None

    def _validate_and_add_server(self, s):
        """Validate a Server or a (Server, Node) couple, then register it.

        All checks are done before any state is changed, so a rejected
        input leaves both the storyboard and the server untouched.

        Raises ValueError if no experiment is set or if one of the
        server's nodes is not part of the experiment, TypeError if the
        input has the wrong type.
        """
        if self.experiment is None:
            raise ValueError("Cannot add a server before "
                             "setting the experiment.")
        if hasattr(s, '__len__') and len(s) == 2:
            server, node = s
            if not isinstance(server, Server) \
                    or not isinstance(node, model.Node):
                raise TypeError('First element must be of "Server" type, '
                                'second must be of "Node" type.')
        elif isinstance(s, Server):
            server, node = s, None
        else:
            raise TypeError('Input servers should be either an object of '
                            '"Server" type or a Server-Node couple.')
        # A node the server already has must not be added a second time,
        # or the server would be started twice on it.
        new_node = node is not None and node not in server.nodes
        to_check = list(server.nodes)
        if new_node:
            to_check.append(node)
        for candidate in to_check:
            if candidate not in self.experiment.nodes:
                raise ValueError('Cannot run server on node %s, '
                                 'not in experiment.' % (candidate.name,))
        if new_node:
            server.add_node(node)
        # The same server may be given once per node; registering it more
        # than once would run it, and generate its clients, repeatedly.
        if server not in self.servers:
            self.servers.append(server)

    def set_experiment(self, experiment):
        """Set the experiment; raises TypeError if it is not an Experiment."""
        if not isinstance(experiment, model.Experiment):
            raise TypeError('Experiment instance required.')
        self.experiment = experiment

    def add_server(self, server):
        """Add a Server or a (Server, Node) couple to the storyboard."""
        self._validate_and_add_server(server)

    def del_server(self, server):
        """Remove a server from the storyboard."""
        self.servers.remove(server)

    def _drop_expired_clients(self):
        """Check every active client and keep only those still running."""
        self.active_clients = [c for c in self.active_clients if c.check()]

    def start(self):
        """Run the storyboard for self.duration seconds.

        Writes the client launcher script on the client nodes and starts
        the servers. Then, every DEFAULT_INTERVAL seconds, it starts the
        newly arrived clients and stops the expired ones. Whatever
        happens, remaining clients and all servers are stopped at the end.
        """
        self.start_time = time.time()
        script = r'logname="$1"; shift; nohup "${@}" ' \
                 r'> /tmp/${logname}.rumba.log 2>&1  & echo "$!"'
        logger.debug("Writing utility startup script on client nodes.")
        # A node may be shared by several clients: write the script once.
        prepared = set()
        for server in self.servers:
            for client in server.clients:
                for node in client.nodes:
                    if node in prepared:
                        continue
                    prepared.add(node)
                    node.execute_command(
                        "echo '%s' > startup.sh && chmod a+x startup.sh"
                        % (script,)
                    )
        try:
            for server in self.servers:
                server.run()
            while time.time() - self.start_time < self.duration:
                for server in self.servers:
                    clients = server.get_new_clients(self.DEFAULT_INTERVAL)
                    for new_client in clients:  # type: ClientProcess
                        # Make sure the duration of the client does not
                        # go beyond the storyboard lifetime
                        new_client.duration = min(
                            new_client.duration,
                            self.duration - (time.time() - self.start_time)
                        )
                        # Do not start clients that would not run for
                        # at least the minimum duration
                        # (due to sb constraints)
                        if new_client.duration < server.min_duration:
                            continue
                        new_client.run()
                        self.active_clients.append(new_client)
                self._drop_expired_clients()
                time.sleep(self.DEFAULT_INTERVAL)
            time.sleep(5)
            # Do a check that is supposed to find all remaining clients
            # as expired
            self._drop_expired_clients()
            if self.active_clients:  # implied: is not empty
                logger.warning('Some clients could not be killed gracefully.')
        finally:  # Kill everything. No more mercy.
            for client in self.active_clients:
                client.stop()
            for server in self.servers:
                server.stop()

    @staticmethod
    def _fetch_listed_logs(node, list_cmd, local_dir):
        """Fetch into local_dir the files listed by list_cmd on node."""
        # Assumes execute_command returns the command output as a string
        # -- confirm against the Node implementation.
        logs_list = node.execute_command(list_cmd)
        logger.info('Log list is:\n%s', logs_list)
        # Blank entries (no logs at all, trailing newline) are not files.
        names = [name for name in logs_list.split('\n') if name.strip()]
        if names:
            node.fetch_files(names, local_dir)

    def fetch_logs(self, local_dir='.'):
        """Copy server and client logs from the nodes into local_dir.

        Raises Exception if local_dir is not an existing directory.
        """
        if not os.path.isdir(local_dir):
            raise Exception('"%s" is not a directory. Cannot fetch logs.'
                            % local_dir)
        server_nodes = set()
        client_nodes = set()
        for server in self.servers:
            server_nodes.update(server.nodes)
            for client in server.clients:
                client_nodes.update(client.nodes)
        # '|| echo ""' keeps the listing from failing when no log exists.
        for node in server_nodes:
            self._fetch_listed_logs(node, 'ls *_server.log || echo ""',
                                    local_dir)
        for node in client_nodes:
            self._fetch_listed_logs(node, 'ls /tmp/*.rumba.log || echo ""',
                                    local_dir)