Add automatic restart for crashed worker processes
Workers are now monitored and automatically restarted when they crash. The worker pool validates each address before returning it, so stale entries left behind by crashed workers are skipped.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -46,12 +46,32 @@ func (p *WorkerPool) SetWorkers(addrs []string) {
|
||||
}
|
||||
|
||||
// Acquire blocks until a worker is available and returns its address.
|
||||
// Validates that the worker is still in the current set before returning.
|
||||
func (p *WorkerPool) Acquire() (string, bool) {
|
||||
addr, ok := <-p.available
|
||||
if ok {
|
||||
log.Printf("[pool] Acquired worker %s", addr)
|
||||
for {
|
||||
addr, ok := <-p.available
|
||||
if !ok {
|
||||
return "", false
|
||||
}
|
||||
|
||||
// Validate worker is still in current set (might have crashed)
|
||||
p.mu.Lock()
|
||||
valid := false
|
||||
for _, w := range p.workers {
|
||||
if w == addr {
|
||||
valid = true
|
||||
break
|
||||
}
|
||||
}
|
||||
p.mu.Unlock()
|
||||
|
||||
if valid {
|
||||
log.Printf("[pool] Acquired worker %s", addr)
|
||||
return addr, true
|
||||
}
|
||||
log.Printf("[pool] Skipping stale worker %s", addr)
|
||||
// Worker was removed, try next one
|
||||
}
|
||||
return addr, ok
|
||||
}
|
||||
|
||||
// Release marks a worker as available again after it finishes handling a request.
|
||||
@@ -73,3 +93,42 @@ func (p *WorkerPool) Release(addr string) {
|
||||
}
|
||||
// Worker not in current set (probably from before a rebuild), ignore
|
||||
}
|
||||
|
||||
// RemoveWorker removes a worker from the pool (e.g., when it crashes).
|
||||
// The worker will no longer receive requests.
|
||||
func (p *WorkerPool) RemoveWorker(addr string) {
|
||||
p.mu.Lock()
|
||||
defer p.mu.Unlock()
|
||||
|
||||
// Remove from workers slice
|
||||
newWorkers := make([]string, 0, len(p.workers))
|
||||
for _, w := range p.workers {
|
||||
if w != addr {
|
||||
newWorkers = append(newWorkers, w)
|
||||
}
|
||||
}
|
||||
p.workers = newWorkers
|
||||
|
||||
log.Printf("[pool] Removed worker %s, remaining: %v", addr, p.workers)
|
||||
}
|
||||
|
||||
// AddWorker adds a worker to the pool and makes it available for requests.
|
||||
func (p *WorkerPool) AddWorker(addr string) {
|
||||
p.mu.Lock()
|
||||
defer p.mu.Unlock()
|
||||
|
||||
// Check if already in pool
|
||||
for _, w := range p.workers {
|
||||
if w == addr {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
p.workers = append(p.workers, addr)
|
||||
select {
|
||||
case p.available <- addr:
|
||||
log.Printf("[pool] Added worker %s", addr)
|
||||
default:
|
||||
log.Printf("[pool] Added worker %s (channel full)", addr)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user