Sunday 3 January 2021

NodeJS child process sporadically stops

I apologise for the ambiguity of the question title but I am really stumped.

My app is a sport betting bot. It's a containerised app running on a DO box which spins up multiple NodeJS process' with the cluster module. The job of each child process that is spun up is to monitor a single sporting event throughout several betting exchanges (exchange1, exchange2, & exchange3).

The master process is the orchestrator of these child ones. On a setInterval, the master process gets the state of each live event. If an event has ended, it will send a message to the child process/worker monitoring that event to die. This was previously handled within the child process itself to completely sandbox them but didn't work out.

The app places 2 bets against each other to cover all outcomes. These 2 bets are supposed to be placed at the same time but, sadly, this is not always the case. If this happens, there is a process to rectify ("rectification process") this position to keep attempting the other side of the bet. This is where the problem occurs. Sporadically, a child process will just...stop (?). There are no logs after a certain point & there is no more logging from that child process. The rectification process works on a recursive setTimeout:

src/rectify.js

const logError = require('./log-error')
const logger = require('./logger/winston')
const Helpers = {
    exchange1: {
        tradeOut: require('./exchange1/trade-out'),
        retry: require('./exchange1/retry')
    },
    exchange2: {
        tradeOut: require('./exchange2/trade-out'),
        retry: require('./exchange2/retry'),
    },
    exchange3: {
        tradeOut: require('./exchange3/trade-out'),
        retry: require('./exchange3/retry')
    }
}

let resolutionAttempts = 0

module.exports = async function(bet, runner) {
    try {
        const _promise = new Promise((resolve, reject) => {
            async function rectify() {
                try {
                    const [lowEx, upEx] = [runner.exchange.toLowerCase(), runner.exchange.toUpperCase()]

                    resolutionAttempts++

                    const isAttemptTradeOut = resolutionAttempts % 2 === 0
                    const _attempt = isAttemptTradeOut ? 'TRADE OUT' : 'RETRY'
                    const _lossMsg = `${(resolutionLossAllowed * 100).toFixed(2)}% LOSS ALLOWED`

                    logger.info(`[ATTEMPTING ${_attempt} FOR ${upEx} WITH ${_lossMsg} (Process: ${process.pid})]`)
 
                    const runners = {
                        matched: { ... },
                        unmatched: { ... }
                    }
                    const result = isAttemptTradeOut
                        ? await Helpers[lowEx].tradeOut(bet, runners)
                        : await Helpers[lowEx].retry(bet, runners)

                    if (result) {
                        return resolve(result)
                    }
                    setTimeout(rectify, 10000) // 10 secs
                } catch(err) {
                    logError(err, __filename)

                    return reject(err)
                }
            }
            rectify()
        })
        const _res = await _promise

        return _res
    } catch(err) {
        logError(err, __filename)

        throw err
    }
})

An example of one of the retry.js files for an exchange (exchange3):

src/exchange3/retry.js

const logError = require('./log-error')
const logger = require('./logger/winston')
const api = require('./api')
const placeKeepBet = require('./place-keep-bet')

async function retry(bet, runners) {
    try {
        const betId = bet.id.toString()
        const response = await api.getPrices()
        const { prices } = response.data

        if (prices?.length) {
            const samePrice = prices.find(_price => {
                return _price.side === runners.unmatched.side
            })

            if (samePrice) {
                const odds = samePrice['decimal-odds'] || samePrice.odds
                const newArb = { ... }
                const unmatchedRunner = { ... }

                if (unmatchedRunner) {
                    const result = await placeKeepBet(bet)

                    if (result) {
                        return result
                    }
                } else {
                    logger.info(`[NO UNMATCHED RUNNER (EXCHANGE 3 | Bet: ${betId} | Process: ${process.pid})]`)
                }
            } else {
                logger.info(`[NO EXCHANGE 3 SAME PRICE (RETRY | Bet: ${betId} | Process: ${process.pid})]`)
            }
            return
        }
        logger.info(`[NO EXCHANGE 3 PRICES (RETRY | Bet: ${betId} | Process: ${process.pid})]`)

        const _response = await api.getMarket(eventId, marketId)
        const { status } = _response.data

        if (status === 'closed') {
            logger.info(`[EXCHANGE 3 MARKET HAS ENDED (RETRY | Bet: ${betId} | Process: ${process.pid})]`)

            return 'ENDED'
        }
    } catch (err) {
        logError(err, __filename)

        throw err
    }
}

module.exports = retry

src/exchange3/place-keep-bet.js

const logError = require('./log-error')
const logger = require('./logger/winston')
const api = require('./api')

async function placeKeepBet(bet) {
    const betId = bet.id.toString()
    const offer = { ... }
    const _attempt = 'RETRY'

    try {
        logger.info(`[EXCHANGE 3 OFFER (PLACE-KEEP-BET | ${_attempt} | Bet: ${betId} | Process: ${process.pid})]${JSON.stringify(offer)}`)

        const response = await api.submitOffers(offer)
        const _offer = response.data.offers[0]

        if (_offer.status === 'failed') {
            logger.error(`[PLACE BET ERROR (EXCHANGE 3 | PLACE-KEEP-BET | ${_attempt} | Bet: ${betId} | Process: ${process.pid})]`)

            return 'FAILED'
        }
        if (_offer.status === 'matched') {
            logger.info(`[MATCHED (EXCHANGE 3 | PLACE-KEEP-BET | ${_attempt} | Bet: ${betId} Process: ${process.pid})]`)

            return 'MATCHED'
        }
        if (['flushed', 'cancelled'].includes(_offer.status)) {
            logger.info(`[CANCELLED (EXCHANGE 3 | PLACE-KEEP-BET | ${_attempt} | Bet: ${betId} | Process: ${process.pid})]`)

            return 'CANCELLED'
        }
        if (['delayed', 'open'].includes(_offer.status)) {
            const [_stake, _remaining] = [_offer.stake || offer.offers[0].stake, _offer.remaining || offer.offers[0].stake]
            const _status = _remaining < _stake ? 'PARTIALLY_MATCHED' : 'UNMATCHED'

            logger.info(`[${_status} (EXCHANGE 3 | PLACE-KEEP-BET | ${_attempt} | Bet: ${betId} | Process: ${process.pid})]`)

            return 'UNMATCHED'
        }
        const _mresponse = await api.getMarket()
        const market = _mresponse?.data

        if (!['open', 'suspended'].includes(market?.status)) {
            return 'ENDED'
        }
        return 'FAILED'
    } catch (err) {
        logError(err, __filename)

        throw err
    }
}

module.exports = placeKeepBet

I have made sure to put Winston logs everywhere as I have encountered this issue so many times that I am trying to narrow it down to where it hits. Expected behaviour is to see logs like:

Expected:

{"message":"[ATTEMPTING RETRY FOR EXCHANGE 3 WITH 30.20% LOSS ALLOWED (Process: 174)]","level":"info","timestamp":"2020-12-29 17:38:16"}
{"message":"[EXCHANGE 3 OFFER (PLACE-KEEP-BET | RETRY | Bet: 1234 | Process: 174)]","level":"info","timestamp":"2020-12-29 17:38:32"}
{"message":"[MATCHED (EXCHANGE 3 | PLACE-KEEP-BET | RETRY | Bet: 1234 | Process: 174)]","level":"info","timestamp":"2020-12-29 17:38:38"}
{"message":"[EVENT HAS ENDED (Process: 174)]","level":"info","timestamp":"2020-12-29 18:25:01"}
{"message":"[EXITING WITH CODE 0 (Process: 174)]","level":"info","timestamp":"2020-12-29 18:25:01"}

The "EVENT HAS ENDED ..." message is logged from the master process that is polling the state of all the live events (as mentioned above). The "EXITING WITH CODE ..." message is logged by the child process acknowledging that the master process has told it to die as it's event has ended.

That is a basic logging example of what I could see. You'll be able to see there are logs everywhere throughout retry.js & place-keep-bet.js.

When this issue occurs of sporadically dying out, my logs are only:

Actual:

{"message":"[ATTEMPTING RETRY FOR EXCHANGE 3 WITH 30.20% LOSS ALLOWED (Process: 174)]","level":"info","timestamp":"2020-12-29 17:38:16"}
{"message":"[EVENT HAS ENDED (Process: 174)]","level":"info","timestamp":"2020-12-29 18:25:01"}

Even if there is no offer logged as the rectification is not possible for some reason, I would still expect more logs. I have logs throughout retry.js to inform me of what's happening. It just looks like the child process has frozen. The timeout callback has been invoked but it's just...stopped (?).

As @Bergi asked, am I sure that the master process is not killing the child process'? I am fairly sure as I have these hooks in src/worker/index.js (which is the entry point for a worker - child process):

const logger = require('../logger/winston')

module.exports = async function() {
     try {
        process.on('exit', code => {
            logger.info(`[EXITING WITH CODE ${code} (Process: ${process.pid})]`)
        })
        process.on('disconnect', () => {
            logger.info(`[WORKER DISCONNECTED (Process: ${process.pid})]`)
        })
     } catch(err) {
         // handle error
     }
}

In the latter example of the logs (Actual) that are not working correctly, the master process has logged that the event has ended but the child process has not. This makes me think that the child process has already exited but has not logged that it has? Is this possible?

As mentioned, this is a dockerised app running on a DO box with 8 vCPUs & 8 GB of mem. I previously thought that this was happening because the container reached it's resource constraints (I limit the memory the container can use - 5/8 GB) but looking at the time of the last log that's not working ("[ATTEMPTING RETRY ..."), only 5% CPU was being used & 18% of memory. It's also worth mentioning that the master process continued to log (as shown with the "[EVENT HAS ENDED ..." message) & that other child process' were spun up (& logging) whilst this one was stalled. I'm clueless as to why this is happening & has been happening for months.

At this point in time, there was about 10 events that were live so that would have been 10 child process' + 1 master + 1 for an API. However, sometimes there can be 50+ child process' running & @ 100% CPU usage but I will have no issue.

To provide a little more info on the "rectification process" as that's where the error lies/logs stop. I mention that the process is a recursive setTimeout. The example I gave above of the expected logs were that of best case scenario where the other side of the "matched bet" was matched instantly & the process succeeded on the 1st pass.

However, this is not always the case & could take many iterations of the process to be rectified. An example of logs here would be:

{"message":"[ATTEMPTING RETRY FOR EXCHANGE 3 WITH 30.20% LOSS ALLOWED (Process: 174)]","level":"info","timestamp":"2020-12-29 17:38:16"}
{"message":"[NO EXCHANGE 3 SAME PRICE (RETRY | Bet: 1234 | Process: 174)]","level":"info","timestamp":"2020-12-29 17:38:32"}
{"message":"[ATTEMPTING TRADE OUT FOR EXCHANGE 2 WITH 32.10% LOSS ALLOWED (Process: 174)]","level":"info","timestamp":"2020-12-29 17:48:28"}
{"message":"[NO EXCHANGE 2 RUNNER (RETRY | Bet: 1234 | Process: 174)]","level":"info","timestamp":"2020-12-29 17:48:34"}
{"message":"[ATTEMPTING RETRY FOR EXCHANGE 3 WITH 34.50% LOSS ALLOWED (Process: 174)]","level":"info","timestamp":"2020-12-29 17:58:37"}
{"message":"[EXCHANGE 3 OFFER (PLACE-KEEP-BET | RETRY | Bet: 1234 | Process: 174)]","level":"info","timestamp":"2020-12-29 17:58:38"}
{"message":"[MATCHED (EXCHANGE 3 | PLACE-KEEP-BET | RETRY | Bet: 1234 | Process: 174)]","level":"info","timestamp":"2020-12-29 17:58:43"}
{"message":"[EVENT HAS ENDED (Process: 174)]","level":"info","timestamp":"2020-12-29 18:25:01"}
{"message":"[EXITING WITH CODE 0 (Process: 174)]","level":"info","timestamp":"2020-12-29 18:25:01"}

The idea behind this was to keep incrementing the percentage "loss allowed" until the process finds a price that is viable to bet on.

The error behind no longer logging can happen at any time during this process. On the example this question is based on, it happened to fail on the 1st attempt but it can happen on the Nth attempt, it isn't just the 1st.



from NodeJS child process sporadically stops

No comments:

Post a Comment