nova rabbitmq strange bug
Folks,

I am running Wallaby, and today I had a RabbitMQ issue, so I rebuilt RabbitMQ. But now all my compute nodes are throwing the following error and I am not sure what is going on; they do not come up and keep crashing with this error:

2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service [req-6c3cdaa3-ecf9-4bef-9def-6b6a69dfd30b - - - - -] Error starting thread.: oslo_messaging.exceptions.MessagingTimeout: Timed out waiting for a reply to message ID 9510bd6c0d6a48e782ed7bc2887f32c8
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service Traceback (most recent call last):
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/oslo_messaging/_drivers/amqpdriver.py", line 433, in get
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service return self._queues[msg_id].get(block=True, timeout=timeout)
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/eventlet/queue.py", line 322, in get
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service return waiter.wait()
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/eventlet/queue.py", line 141, in wait
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service return get_hub().switch()
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/eventlet/hubs/hub.py", line 313, in switch
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service return self.greenlet.switch()
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service _queue.Empty
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service During handling of the above exception, another exception occurred:
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service Traceback (most recent call last):
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/oslo_service/service.py", line 807, in run_service
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service service.start()
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/nova/service.py", line 159, in start
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service self.manager.init_host()
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/nova/compute/manager.py", line 1413, in init_host
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service self.driver.init_host(host=self.host)
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/nova/virt/libvirt/driver.py", line 792, in init_host
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service self._register_instance_machine_type()
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/nova/virt/libvirt/driver.py", line 811, in _register_instance_machine_type
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service for instance in objects.InstanceList.get_by_host(context, hostname):
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/oslo_versionedobjects/base.py", line 175, in wrapper
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service result = cls.indirection_api.object_class_action_versions(
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/nova/conductor/rpcapi.py", line 240, in object_class_action_versions
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service return cctxt.call(context, 'object_class_action_versions',
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/oslo_messaging/rpc/client.py", line 175, in call
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service self.transport._send(self.target, msg_ctxt, msg,
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/oslo_messaging/transport.py", line 123, in _send
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service return self._driver.send(target, ctxt, message,
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/oslo_messaging/_drivers/amqpdriver.py", line 680, in send
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service return self._send(target, ctxt, message, wait_for_reply, timeout,
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/oslo_messaging/_drivers/amqpdriver.py", line 669, in _send
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service result = self._waiter.wait(msg_id, timeout,
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/oslo_messaging/_drivers/amqpdriver.py", line 559, in wait
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service message = self.waiters.get(msg_id, timeout=timeout)
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service File "/openstack/venvs/nova-23.1.1/lib/python3.8/site-packages/oslo_messaging/_drivers/amqpdriver.py", line 435, in get
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service raise oslo_messaging.MessagingTimeout(
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service oslo_messaging.exceptions.MessagingTimeout: Timed out waiting for a reply to message ID 9510bd6c0d6a48e782ed7bc2887f32c8
2022-02-10 00:52:27.843 3660531 ERROR oslo_service.service
What do the rabbit logs reveal?

Zitat von Satish Patel <satish.txt@gmail.com>:
> Folks,
> I am running Wallaby, and today I had a RabbitMQ issue, so I rebuilt RabbitMQ. But now all my compute nodes are throwing the following error and I am not sure what is going on; they do not come up and keep crashing with this error: [...]
The RabbitMQ logs are very clean, all INFO-level logging. It feels like something is broken between the conductor and nova-compute; compute says it is waiting for a message ID.

Maybe while I was rebuilding Rabbit, nova-conductor kept trying to connect or was in the middle of something, and when Rabbit came back some messages got stuck halfway in a queue. I don't know the flow, so I'm just making up a theory.

All computes show the same logs, but each is waiting for a different message ID. Does nova-compute talk to the conductor and wait for an ack/reply or not?
On Feb 11, 2022, at 5:39 AM, Eugen Block <eblock@nde.ag> wrote:
> What do the rabbit logs reveal?
So 'rabbitmqctl cluster_status' shows the expected output? Can you trace back the request ID (req-6c3cdaa3-ecf9-4bef-9def-6b6a69dfd30b) and see if it was completed? Usually I see nova recover after a rabbit outage, so this would be unexpected.

One more thing: you wrote that you're seeing these messages today, but you pasted messages from yesterday. Is nova still logging them?
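For reference, a broker-side sanity check along these lines might help (a minimal sketch; rabbitmqctl output formats vary between RabbitMQ versions):

# Verify that all nodes have rejoined the rebuilt cluster
rabbitmqctl cluster_status

# Look for queues that hold messages but have no consumers,
# which would fit the "stuck waiting for a reply" symptom
rabbitmqctl list_queues name messages consumers | awk '$3 == 0 && $2 > 0'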
Zitat von Satish Patel <satish.txt@gmail.com>:

> The RabbitMQ logs are very clean, all INFO-level logging. It feels like something is broken between the conductor and nova-compute; compute says it is waiting for a message ID. [...]
This is what I did. I am not an expert in RabbitMQ debugging, where I could trace a message back via its ID or something; if there is a good way, let me know and I will give it a try.

Finally, I did the following to recover, after 24 hours: shut down all nova services first (api, conductor, scheduler, etc.), then wiped out my RabbitMQ completely, including mnesia, then rebuilt the cluster and started the nova services one by one (a rough sketch of the sequence follows below). After that I didn't see any issue on my compute nodes; no errors, nothing, they came back clean.

I'm trying to reproduce the same incident in the lab to see if I can hit that condition. I don't understand the relationship here that forces me to stop all nova services in order to rebuild RabbitMQ. We have 200 compute nodes in this cluster; do you think that could be the issue, since they were constantly trying to talk to rabbit while it was down?

I have one more OpenStack deployment running Queens, a 250-node cloud, and I have rebuilt RabbitMQ there a million times without ever having an issue like I had with Wallaby.
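A rough sketch of that recovery sequence (assuming systemd units and default paths; the actual unit names, hosts, and directories depend on the deployment tool, and 'node1' is a placeholder hostname):

# 1. Stop every nova service that talks to rabbit (controllers first)
systemctl stop nova-api nova-scheduler nova-conductor

# 2. On each rabbit node: stop the broker and wipe its state, including mnesia
systemctl stop rabbitmq-server
rm -rf /var/lib/rabbitmq/mnesia/*

# 3. Rebuild the cluster: bring up the first node, then join the others to it
systemctl start rabbitmq-server        # on every node
rabbitmqctl stop_app                   # on the joining nodes only
rabbitmqctl join_cluster rabbit@node1
rabbitmqctl start_app

# 4. Start the nova services again one by one and watch the compute logs
systemctl start nova-conductor nova-scheduler nova-api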
On Feb 11, 2022, at 9:11 AM, Eugen Block <eblock@nde.ag> wrote:
> So 'rabbitmqctl cluster_status' shows the expected output? Can you trace back the request ID (req-6c3cdaa3-ecf9-4bef-9def-6b6a69dfd30b) and see if it was completed? [...]
Hi,
> This is what I did. I am not an expert in RabbitMQ debugging, where I could trace a message back via its ID or something; if there is a good way, let me know and I will give it a try.
You'll find the request ID across the respective services, so if you grep for that ID in the nova, neutron, cinder, etc. log directories, you'll likely be able to find out which services tried to do what.
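For example (a sketch; log paths differ between deployments):

# Search each service's logs for the request ID from the compute error
grep -r 'req-6c3cdaa3-ecf9-4bef-9def-6b6a69dfd30b' \
    /var/log/nova /var/log/neutron /var/log/cinder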
> I'm trying to reproduce the same incident in the lab to see if I can hit that condition. I don't understand the relationship here that forces me to stop all nova services in order to rebuild RabbitMQ. We have 200 compute nodes in this cluster; do you think that could be the issue, since they were constantly trying to talk to rabbit while it was down?
All OpenStack services try to talk to rabbit all the time, not only nova; you'll notice a "transport_url" in every service's config file. And yes, if rabbit is down the services will complain. But as I already wrote, usually they recover eventually, although there might be some timeout you hit; I'm not sure. Or this is indeed an issue with Wallaby. I haven't had the chance to install it yet; the latest version I'm working with is Victoria, and there the recovery seems to work as expected. Have you checked Launchpad for existing bugs?
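For reference, you could check that setting like this (a sketch; the config path and the user, password, hostnames, and vhost in the URL shown are placeholders):

# Every service config carries the broker address, e.g.:
grep transport_url /etc/nova/nova.conf
# transport_url = rabbit://nova:SECRET@rabbit1:5672,nova:SECRET@rabbit2:5672/nova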
participants (2)
- Eugen Block
- Satish Patel