PoC: Network simulation framework (#1555)

* simv2: wip

* simulation: exec adapter start/stop

* simulation: add node status to exec adapter

* simulation: initial simulation code

* simulation: exec adapter, configure path to executable

* simulation: initial docker adapter

* simulation: wip kubernetes adapter

* simulation: kubernetes adapter proxy

* simulation: implement GetAll/StartAll/StopAll

* simulation: kuberentes adapter - set env vars and resource limits

* simulation: discovery test

* simulation: remove port definitions within docker adapter

* simulation: simplify wait for healthy loop

* simulation: get nat ip addr from interface

* simulation: pull docker images automatically

* simulation: NodeStatus -> NodeInfo

* simulation: move discovery test to example dir

* simulation: example snapshot usage

* simulation: add goclient specific simulation

* simulation: add peer connections to snapshot

* simulation: close rpc client

* simulation: don't export kubernetes proxy server

* simulation: merge simulation code

* simulation: don't export nodemap

* simulation: rename SimulationSnapshot -> Snapshot

* simulation: linting fixes

* simulation: add k8s available helper func

* simulation: vendor

* simulation: fix 'no non-test Go files' when building

* simulation: remove errors from interface methods where non were returned

* simulation: run getHealthInfo check in parallel
This commit is contained in:
Rafael Matias
2019-07-24 17:00:13 +02:00
committed by GitHub
parent 9720da34db
commit 388d8ccd9f
1570 changed files with 460774 additions and 3203 deletions

604
simulation/simulation.go Normal file
View File

@@ -0,0 +1,604 @@
package simulation
import (
"context"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"os"
"reflect"
"strings"
"sync"
"time"
"github.com/ethereum/go-ethereum/common/hexutil"
"github.com/ethereum/go-ethereum/crypto"
"github.com/ethereum/go-ethereum/p2p"
"github.com/ethereum/go-ethereum/rpc"
"github.com/ethersphere/swarm/log"
"github.com/ethersphere/swarm/network"
"golang.org/x/sync/errgroup"
)
type nodeMap struct {
sync.RWMutex
internal map[NodeID]Node
}
func newNodeMap() *nodeMap {
return &nodeMap{
internal: make(map[NodeID]Node),
}
}
func (nm *nodeMap) Load(key NodeID) (value Node, ok bool) {
nm.RLock()
result, ok := nm.internal[key]
nm.RUnlock()
return result, ok
}
func (nm *nodeMap) LoadAll() []Node {
nm.RLock()
v := []Node{}
for _, node := range nm.internal {
v = append(v, node)
}
nm.RUnlock()
return v
}
func (nm *nodeMap) Store(key NodeID, value Node) {
nm.Lock()
nm.internal[key] = value
nm.Unlock()
}
// Simulation is used to simulate a network of nodes
type Simulation struct {
adapter Adapter
nodes *nodeMap
}
// NewSimulation creates a new simulation given an adapter
func NewSimulation(adapter Adapter) *Simulation {
sim := &Simulation{
adapter: adapter,
nodes: newNodeMap(),
}
return sim
}
func getAdapterFromSnapshotConfig(snapshot *AdapterSnapshot) (Adapter, error) {
if snapshot == nil {
return nil, errors.New("snapshot can't be nil")
}
var adapter Adapter
var err error
switch t := snapshot.Type; t {
case "exec":
adapter, err = NewExecAdapter(snapshot.Config.(ExecAdapterConfig))
case "docker":
adapter, err = NewDockerAdapter(snapshot.Config.(DockerAdapterConfig))
case "kubernetes":
adapter, err = NewKubernetesAdapter(snapshot.Config.(KubernetesAdapterConfig))
default:
return nil, fmt.Errorf("unknown adapter type: %s", t)
}
if err != nil {
return nil, fmt.Errorf("could not initialize %s adapter: %v", snapshot.Type, err)
}
return adapter, nil
}
// NewSimulationFromSnapshot creates a simulation from a snapshot
func NewSimulationFromSnapshot(snapshot *Snapshot) (*Simulation, error) {
// Create adapter
adapter, err := getAdapterFromSnapshotConfig(snapshot.DefaultAdapter)
if err != nil {
return nil, err
}
sim := &Simulation{
adapter: adapter,
nodes: newNodeMap(),
}
// Loop over nodes and add them
for _, n := range snapshot.Nodes {
if n.Adapter == nil {
if err := sim.Init(n.Config); err != nil {
return sim, fmt.Errorf("failed to initialize node %v", err)
}
} else {
adapter, err := getAdapterFromSnapshotConfig(n.Adapter)
if err != nil {
return sim, fmt.Errorf("could not read adapter configureation for node %s: %v", n.Config.ID, err)
}
if err := sim.InitWithAdapter(n.Config, adapter); err != nil {
return sim, fmt.Errorf("failed to initialize node %s: %v", n.Config.ID, err)
}
}
}
// Start all nodes
err = sim.StartAll()
if err != nil {
return sim, err
}
// Establish connections
m := make(map[string]Node)
for _, n := range sim.GetAll() {
enode := removeNetworkAddressFromEnode(n.Info().Enode)
m[enode] = n
}
for _, con := range snapshot.Connections {
from, ok := m[con.From]
if !ok {
return sim, fmt.Errorf("no node found with enode: %s", con.From)
}
to, ok := m[con.To]
if !ok {
return sim, fmt.Errorf("no node found with enode: %s", con.To)
}
client, err := sim.RPCClient(from.Info().ID)
if err != nil {
return sim, err
}
defer client.Close()
if err := client.Call(nil, "admin_addPeer", to.Info().Enode); err != nil {
return sim, err
}
}
return sim, nil
}
func (s *AdapterSnapshot) detectConfigurationType() error {
adapterconfig, err := json.Marshal(s.Config)
if err != nil {
return err
}
switch t := s.Type; t {
case "exec":
var config ExecAdapterConfig
err := json.Unmarshal(adapterconfig, &config)
if err != nil {
return err
}
s.Config = config
case "docker":
var config DockerAdapterConfig
err := json.Unmarshal(adapterconfig, &config)
if err != nil {
return err
}
s.Config = config
case "kubernetes":
var config KubernetesAdapterConfig
err := json.Unmarshal(adapterconfig, &config)
if err != nil {
return err
}
s.Config = config
default:
return fmt.Errorf("unknown adapter type: %s", t)
}
return nil
}
func unmarshalSnapshot(data []byte, snapshot *Snapshot) error {
err := json.Unmarshal(data, snapshot)
if err != nil {
return err
}
// snapshot.Adapter.Config will be of type map[string]interface{}
// we have to unmarshal it to the correct adapter configuration struct
if err := snapshot.DefaultAdapter.detectConfigurationType(); err != nil {
return err
}
for _, n := range snapshot.Nodes {
if n.Adapter != nil {
if err := n.Adapter.detectConfigurationType(); err != nil {
return err
}
}
}
return nil
}
// LoadSnapshotFromFile loads a snapshot from a given JSON file
func LoadSnapshotFromFile(filePath string) (*Snapshot, error) {
f, err := os.Open(filePath)
if err != nil {
return nil, err
}
defer f.Close()
bytes, err := ioutil.ReadAll(f)
if err != nil {
return nil, err
}
var snapshot Snapshot
err = unmarshalSnapshot(bytes, &snapshot)
if err != nil {
return nil, err
}
return &snapshot, nil
}
// Get returns a node by ID
func (s *Simulation) Get(id NodeID) (Node, error) {
node, ok := s.nodes.Load(id)
if !ok {
return nil, fmt.Errorf("a node with id %s does not exist", id)
}
return node, nil
}
// GetAll returns all nodes
func (s *Simulation) GetAll() []Node {
return s.nodes.LoadAll()
}
// DefaultAdapter returns the default adapter that the simulation was initialized with
func (s *Simulation) DefaultAdapter() Adapter {
return s.adapter
}
// Init initializes a node with the NodeConfig with the default Adapter
func (s *Simulation) Init(config NodeConfig) error {
return s.InitWithAdapter(config, s.DefaultAdapter())
}
// InitWithAdapter initializes a node with the NodeConfig and the given Adapter
func (s *Simulation) InitWithAdapter(config NodeConfig, adapter Adapter) error {
if _, ok := s.nodes.Load(config.ID); ok {
return fmt.Errorf("a node with id %s already exists", config.ID)
}
node := adapter.NewNode(config)
s.nodes.Store(config.ID, node)
return nil
}
// Start starts a node by ID
func (s *Simulation) Start(id NodeID) error {
node, ok := s.nodes.Load(id)
if !ok {
return fmt.Errorf("a node with id %s does not exist", id)
}
if err := node.Start(); err != nil {
return fmt.Errorf("could not start node: %v", err)
}
return nil
}
// Stop stops a node by ID
func (s *Simulation) Stop(id NodeID) error {
node, ok := s.nodes.Load(id)
if !ok {
return fmt.Errorf("a node with id %s does not exist", id)
}
if err := node.Stop(); err != nil {
return fmt.Errorf("could not stop node: %v", err)
}
return nil
}
// StartAll starts all nodes
func (s *Simulation) StartAll() error {
g, _ := errgroup.WithContext(context.Background())
for _, node := range s.nodes.LoadAll() {
g.Go(node.Start)
}
return g.Wait()
}
// StopAll stops all nodes
func (s *Simulation) StopAll() error {
g, _ := errgroup.WithContext(context.Background())
for _, node := range s.nodes.LoadAll() {
g.Go(node.Stop)
}
return g.Wait()
}
// RPCClient returns an RPC Client for a given node
func (s *Simulation) RPCClient(id NodeID) (*rpc.Client, error) {
node, ok := s.nodes.Load(id)
if !ok {
return nil, fmt.Errorf("a node with id %s does not exist", id)
}
info := node.Info()
var client *rpc.Client
var err error
for start := time.Now(); time.Since(start) < 10*time.Second; time.Sleep(50 * time.Millisecond) {
client, err = rpc.Dial(info.RPCListen)
if err == nil {
break
}
}
if client == nil {
return nil, fmt.Errorf("could not establish rpc connection: %v", err)
}
return client, nil
}
// HTTPBaseAddr returns the address for the HTTP API
func (s *Simulation) HTTPBaseAddr(id NodeID) (string, error) {
node, ok := s.nodes.Load(id)
if !ok {
return "", fmt.Errorf("a node with id %s does not exist", id)
}
info := node.Info()
return info.HTTPListen, nil
}
// Snapshot returns a snapshot of the simulation
func (s *Simulation) Snapshot() (*Snapshot, error) {
snap := Snapshot{}
// Default adapter snapshot
asnap := s.DefaultAdapter().Snapshot()
snap.DefaultAdapter = &asnap
// Nodes snapshot
nodes := s.GetAll()
snap.Nodes = make([]NodeSnapshot, len(nodes))
snap.Connections = []ConnectionSnapshot{}
for idx, n := range nodes {
ns, err := n.Snapshot()
if err != nil {
return nil, fmt.Errorf("failed to get nodes snapshot %s: %v", n.Info().ID, err)
}
// Don't need to specify the node's adapter snapshot if it's
// the same as the default adapters snapshot
if reflect.DeepEqual(asnap, *ns.Adapter) {
ns.Adapter = nil
}
snap.Nodes[idx] = ns
// Get connections
client, err := s.RPCClient(n.Info().ID)
if err != nil {
return nil, err
}
defer client.Close()
var peers []*p2p.PeerInfo
err = client.Call(&peers, "admin_peers")
if err != nil {
return nil, err
}
for _, p := range peers {
// Only care about outbound connections
if !p.Network.Inbound {
snap.Connections = append(snap.Connections, ConnectionSnapshot{
// we need to remove network addresses from enodes
// because they will change between simulations
From: removeNetworkAddressFromEnode(n.Info().Enode),
To: removeNetworkAddressFromEnode(p.Enode),
})
}
}
}
return &snap, nil
}
// AddBootnode adds and starts a bootnode with the given id and arguments
func (s *Simulation) AddBootnode(id NodeID, args []string) (Node, error) {
a := []string{
"--bootnode-mode",
"--bootnodes", "",
}
a = append(a, args...)
return s.AddNode(id, a)
}
// AddNode adds and starts a node with the given id and arguments
func (s *Simulation) AddNode(id NodeID, args []string) (Node, error) {
bzzkey, err := randomHexKey()
if err != nil {
return nil, err
}
nodekey, err := randomHexKey()
if err != nil {
return nil, err
}
a := []string{
"--bzzkeyhex", bzzkey,
"--nodekeyhex", nodekey,
}
a = append(a, args...)
cfg := NodeConfig{
ID: id,
Args: a,
// TODO: Figure out how to handle logs when using AddNode(...)
Stdout: ioutil.Discard,
Stderr: ioutil.Discard,
}
err = s.Init(cfg)
if err != nil {
return nil, err
}
err = s.Start(id)
if err != nil {
return nil, err
}
node, err := s.Get(id)
if err != nil {
return nil, err
}
return node, nil
}
// AddNodes adds and starts 'count' nodes with a given ID prefix, arguments.
// If the idPrefix is "node" and count is 3 then the following nodes will be
// created: node-0, node-1, node-2
func (s *Simulation) AddNodes(idPrefix string, count int, args []string) ([]Node, error) {
g, _ := errgroup.WithContext(context.Background())
idFormat := "%s-%d"
for i := 0; i < count; i++ {
id := NodeID(fmt.Sprintf(idFormat, idPrefix, i))
g.Go(func() error {
node, err := s.AddNode(id, args)
if err != nil {
log.Warn("Failed to add node", "id", id, "err", err.Error())
} else {
log.Info("Added node", "id", id, "enode", node.Info().Enode)
}
return err
})
}
err := g.Wait()
if err != nil {
return nil, err
}
nodes := make([]Node, count)
for i := 0; i < count; i++ {
id := NodeID(fmt.Sprintf(idFormat, idPrefix, i))
nodes[i], err = s.Get(id)
if err != nil {
return nil, err
}
}
return nodes, nil
}
// CreateClusterWithBootnode adds and starts a bootnode. Afterwards it will add and start 'count' nodes that connect
// to the bootnode. All nodes can be provided by custom arguments.
// If the idPrefix is "node" and count is 3 then you will have the following nodes created:
// node-bootnode, node-0, node-1, node-2.
// The bootnode will be the first node on the returned Node slice.
func (s *Simulation) CreateClusterWithBootnode(idPrefix string, count int, args []string) ([]Node, error) {
bootnode, err := s.AddBootnode(NodeID(fmt.Sprintf("%s-bootnode", idPrefix)), args)
if err != nil {
return nil, err
}
nodeArgs := []string{
"--bootnodes", bootnode.Info().Enode,
}
nodeArgs = append(nodeArgs, args...)
n, err := s.AddNodes(idPrefix, count, nodeArgs)
if err != nil {
return nil, err
}
nodes := []Node{bootnode}
nodes = append(nodes, n...)
return nodes, nil
}
// WaitForHealthyNetwork will block until all the nodes are considered
// to have a healthy kademlia table
func (s *Simulation) WaitForHealthyNetwork() error {
nodes := s.GetAll()
// Generate RPC clients
var clients struct {
RPC []*rpc.Client
mu sync.Mutex
}
clients.RPC = make([]*rpc.Client, len(nodes))
g, _ := errgroup.WithContext(context.Background())
for idx, node := range nodes {
node := node
idx := idx
g.Go(func() error {
id := node.Info().ID
client, err := s.RPCClient(id)
if err != nil {
return err
}
clients.mu.Lock()
clients.RPC[idx] = client
clients.mu.Unlock()
return nil
})
}
if err := g.Wait(); err != nil {
return err
}
for _, c := range clients.RPC {
defer c.Close()
}
// Generate addresses for PotMap
addrs := [][]byte{}
for _, node := range nodes {
byteaddr, err := hexutil.Decode(node.Info().BzzAddr)
if err != nil {
return err
}
addrs = append(addrs, byteaddr)
}
ppmap := network.NewPeerPotMap(network.NewKadParams().NeighbourhoodSize, addrs)
log.Info("Waiting for healthy kademlia...")
// Check for healthInfo on all nodes
for {
g, _ = errgroup.WithContext(context.Background())
for i := 0; i < len(nodes)-1; i++ {
i := i
g.Go(func() error {
log.Debug("Checking hive_getHealthInfo", "node", nodes[i].Info().ID)
healthy := &network.Health{}
if err := clients.RPC[i].Call(&healthy, "hive_getHealthInfo", ppmap[nodes[i].Info().BzzAddr[2:]]); err != nil {
return err
}
if !healthy.Healthy() {
return fmt.Errorf("node %s is not healthy", nodes[i].Info().ID)
}
return nil
})
}
err := g.Wait()
if err == nil {
break
}
log.Info("Not healthy yet...", "msg", err.Error())
time.Sleep(500 * time.Millisecond)
}
log.Info("Healthy kademlia on all nodes")
return nil
}
func randomHexKey() (string, error) {
key, err := crypto.GenerateKey()
if err != nil {
return "", err
}
keyhex := hex.EncodeToString(crypto.FromECDSA(key))
return keyhex, nil
}
func removeNetworkAddressFromEnode(enode string) string {
if idx := strings.Index(enode, "@"); idx != -1 {
return enode[:idx]
}
return enode
}