Discussion:
PROBLEM Service Alert: hostname/check_postgres_old_transaction is CRITICAL **
Adarsh Sharma
2014-09-19 18:33:30 UTC
Permalink
Hi all,
From the past few days i am getting this old transation alert from some of
my DB slaves. I am using EDB9.2.7.18 on on CentOS6.5 ( Final ). From my
app boxes i got some of the transactions remains stuck for 2-3 hours which
took ~ < 60 seconds in database. Below are the details :-

-- DB execution time
explain analyze select * from appusage_func('2014-09-19 15:26:15.662');

-[ RECORD 1
]--------------------------------------------------------------------------------------------------------------------------------------
QUERY PLAN | Function Scan on appusage_func (cost=0.25..10.25 rows=1000
width=12) (actual time=34060.657..34813.542 rows=7362310 loops=1)
-[ RECORD 2
]--------------------------------------------------------------------------------------------------------------------------------------
QUERY PLAN | Total runtime: 35174.296 ms

It returns approx *311 MB* data to the client servers.

root > netstat -p | grep 45355
tcp 0 1531648 localhost:5499
localhost:48848 ESTABLISHED 45355/postgres

root > strace -p 45355
-- Remain stuck for *2-3 *hours
read(306,
"\30\0kB\3\0\0\0\0\0\377\5\0\0\26\0\0\0\2\0\0\0\30\0+\264\3\0\0\0\0\0"...,
8192) = 8192
sendto(10, "4174\0\0\0\0041535D\0\0\0\30\0\2\0\0\0\006248710\0\0\0"...,
8192, 0, NULL, 0

Then after 2-3 hours it got completed automatically with below stacktrace :

# strace -p 45355
Process 45355 attached - interrupt to quit
sendto(10, "4174\0\0\0\0041535D\0\0\0\30\0\2\0\0\0\006248710\0\0\0"...,
8192, 0, NULL, 0

) = -1 ECONNRESET (Connection reset by peer)
open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/libc.mo", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/usr/share/locale/en_US.utf8/LC_MESSAGES/libc.mo", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/usr/share/locale/en_US/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT
(No such file or directory)
open("/usr/share/locale/en.UTF-8/LC_MESSAGES/libc.mo", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/usr/share/locale/en.utf8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT
(No such file or directory)
open("/usr/share/locale/en/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No
such file or directory)
write(2, "\0\0J\1+\261\0\0t\35\230\2\1\0\0\0002014-09-19 18:04"..., 346) =
346
write(2, "\0\0.\1+\261\0\0t18:\1\0\0\0002014-09-19 18:04"..., 318) = 318
close(306) = 0
stat("base/pgsql_tmp/pgsql_tmp45355.36", {st_mode=S_IFREG|0600,
st_size=161970820, ...}) = 0
unlink("base/pgsql_tmp/pgsql_tmp45355.36") = 0
sendto(9, "\17\0\0\0\30\0\0\0\230\275\22\0\0\0\0\0\204z\247\t\0\0\0\0", 24,
0, NULL, 0) = 24
write(2, "\0\0\256\0+\261\0\0tFf\0\1\0\0\0002014-09-19 18:04"..., 190) = 190
close(305) = 0
close(264) = 0
sendto(9,
"\2\0\0\0\250\3\0\0\230\275\22\0\10\0\0\0\1\0\0\0\1\0\0\0b\366\7\0\0\0\0\0"...,
936, 0, NULL, 0) = 936
sendto(9,
"\2\0\0\0\250\3\0\0\230\275\22\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
936, 0, NULL, 0) = 936
sendto(9,
"\2\0\0\0\250\3\0\0\230\275\22\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
936, 0, NULL, 0) = 936
sendto(9,
"\2\0\0\0\250\3\0\0\230\275\22\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
936, 0, NULL, 0) = 936
sendto(9,
"\2\0\0\0x\1\0\0\230\275\22\0\3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
376, 0, NULL, 0) = 376
sendto(9,
"\f\0\0\0000\0\0\0\230\275\22\0\1\0\0\0e\346_\0002047\1\0\0\0\0\0\0\0"...,
48, 0, NULL, 0) = 48
write(2, "\0\0\325\0+\261\0\0t\35\230\2\1\0\0\0002014-09-19 18:04"..., 229)
= 229
exit_group(1) = ?
Process 45355 detached

I am sure there will be network issue but what might be the reason for
that. It comes 2-3 times daily.

Please let me know if any other information is required.

Thanks
Jan-Pieter Cornet
2014-09-19 22:47:29 UTC
Permalink
Post by Adarsh Sharma
It returns approx *311 MB* data to the client servers.
root > netstat -p | grep 45355
tcp 0 1531648 localhost:5499 localhost:48848 ESTABLISHED 45355/postgres
root > strace -p 45355
-- Remain stuck for *2-3 *hours
read(306, "\30\0kB\3\0\0\0\0\0\377\5\0\0\26\0\0\0\2\0\0\0\30\0+\264\3\0\0\0\0\0"..., 8192) = 8192
sendto(10, "4174\0\0\0\0041535D\0\0\0\30\0\2\0\0\0\006248710\0\0\0"..., 8192, 0, NULL, 0
# strace -p 45355
Process 45355 attached - interrupt to quit
sendto(10, "4174\0\0\0\0041535D\0\0\0\30\0\2\0\0\0\006248710\0\0\0"..., 8192, 0, NULL, 0
) = -1 ECONNRESET (Connection reset by peer)
I'd guess that the receiving program crashes. It might not be able to stomach the 311 MB that's being sent to it, maybe it's expanding, and swapping, until it's killed by the OOM killer?

Doesn't look like a postgres problem to me. postgres is stuck trying to send data... try stracing the client to see what it does? (use "lsof -i :48848" or whatever the port number of the remote is to find the pid).
--
Jan-Pieter Cornet <***@xs4all.net>
"Any sufficiently advanced incompetence is indistinguishable from malice."
- Grey's Law
Adarsh Sharma
2014-09-24 05:52:17 UTC
Permalink
Thanks Jan.!! Will check and update you all the findings.

Cheers
Post by Adarsh Sharma
Post by Adarsh Sharma
It returns approx *311 MB* data to the client servers.
root > netstat -p | grep 45355
tcp 0 1531648 localhost:5499 localhost:48848
ESTABLISHED 45355/postgres
Post by Adarsh Sharma
root > strace -p 45355
-- Remain stuck for *2-3 *hours
read(306,
"\30\0kB\3\0\0\0\0\0\377\5\0\0\26\0\0\0\2\0\0\0\30\0+\264\3\0\0\0\0\0"...,
8192) = 8192
Post by Adarsh Sharma
sendto(10, "4174\0\0\0\0041535D\0\0\0\30\0\2\0\0\0\006248710\0\0\0"...,
8192, 0, NULL, 0
Post by Adarsh Sharma
Then after 2-3 hours it got completed automatically with below
# strace -p 45355
Process 45355 attached - interrupt to quit
sendto(10, "4174\0\0\0\0041535D\0\0\0\30\0\2\0\0\0\006248710\0\0\0"...,
8192, 0, NULL, 0
Post by Adarsh Sharma
) = -1 ECONNRESET (Connection reset by peer)
I'd guess that the receiving program crashes. It might not be able to
stomach the 311 MB that's being sent to it, maybe it's expanding, and
swapping, until it's killed by the OOM killer?
Doesn't look like a postgres problem to me. postgres is stuck trying to
send data... try stracing the client to see what it does? (use "lsof -i
:48848" or whatever the port number of the remote is to find the pid).
--
"Any sufficiently advanced incompetence is indistinguishable from malice."
- Grey's Law
Loading...