使用fluentd收集Kubernetes容器日志

使用fluentd收集kubernetes的容器日志,由elasticsearch存储,并通过kibana展示. 这里不仅收集业务容器的日志, 同时也收集kubernetes集群组件的日志, 由于业务容器跟集群组件的日志打印格式不一致,因此需要单独使用正则进行处理.

dockerd配置

kubernetes组件本身也是以容器的方式部署

容器本身需要将日志都重定向到标准输出,同时指定dockerd的日志打印格式为json,这个可以全局修改dockerd的启动参数

1
2
3
4
5
6
7
# /etc/docker/daemon.json|grep "log-driver"
{
"log-driver": "json-file"
}
# 日志格式如下:
{"log":"64 bytes from 14.215.177.39: seq=34 ttl=55 time=7.067 ms\r\n","stream":"stdout","time":"2019-05-16T14:14:15.030612567Z"}

所有的容器日志都能在目录**/var/log/containers/*.log**找到

因此fluentd就是检测这个目录下的日志变化,类似于tail -f的机制实时获取新增日志.

fluentd本身也是容器. 配置文件是以configmap的形式存在,如下:

fluentd.conf

fluentd.conf

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# ConfigMap holding the fluentd configuration, split into one file per concern:
# system settings, container-log input, journald input, forward input,
# Prometheus monitoring and the Elasticsearch output.
kind: ConfigMap
apiVersion: v1
metadata:
  name: fluentd-config
  namespace: logging
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
data:
  # Global fluentd settings: fluentd's own log format/level and its root dir.
  system.conf: |-
    <system>
      <log>
        format json
        log_level warn
        time_format %Y-%m-%dT%H:%M:%S
      </log>
      root_dir /tmp/fluentd-buffers/
    </system>
  # Tails the docker json-file logs of every container and post-processes them.
  containers.input.conf: |-
    # Tail all container log files except the excluded ones; each line is the
    # JSON envelope written by dockerd ({"log": ..., "stream": ..., "time": ...}).
    <source>
      @id fluentd-k8s-containers.log
      @type tail
      path /var/log/containers/*.log
      exclude_path ["/var/log/containers/*install-cni*", "/var/log/containers/*rke*"]
      pos_file /var/log/fluentd-k8s-containers.log.pos
      tag kubernetes.*
      <parse>
        @type multi_format
        <pattern>
          format json
          time_key time
          time_format %Y-%m-%dT%H:%M:%S.%NZ
        </pattern>
      </parse>
    </source>
    # Enriches records with Kubernetes metadata
    <filter kubernetes.**>
      @id filter_kubernetes_metadata
      @type kubernetes_metadata
      skip_labels true
      #skip_container_metadata true
      skip_master_url true
      skip_namespace_metadata true
    </filter>
    # Adds service_name (the container name) and docker_stamp, which is the
    # event time as epoch seconds shifted by +8 hours (China local time).
    <filter kubernetes.**>
      @type record_transformer
      enable_ruby true
      <record>
        service_name ${record['kubernetes']['container_name']}
        docker_stamp ${time.to_i + 3600 * 8}
      </record>
    </filter>
    # Parses the "log" field itself. The patterns below are listed from the
    # most specific (JSON) to the per-component regular expressions; the
    # original "log" key is removed and the other fields are kept.
    <filter kubernetes.**>
      @id k8s_filter_parser
      @type parser
      key_name log
      reserve_data true
      remove_key_name_field true
      <parse>
        @type multi_format
        <pattern>
          format json
          time_key time
          keep_time_key
          time_format %Y-%m-%dT%H:%M:%S
        </pattern>
        <pattern>
          #k8s-Component
          #{"log":"I0710 04:12:31.540733 1 vxlan_network.go:60] watching for new subnet leases\n","stream":"stderr","time":"2019-07-10T04:12:31.540798651Z"}
          format /^[A-Z]+(?<logtime>.*)[\s]+(?<request_file>[0-9]+.*)\] (?<msg>.*)$/
        </pattern>
        <pattern>
          #cattle-node-agent
          #{"log":"time=\"2019-09-20T18:06:03Z\" level=info msg=\"Starting plan monitor\"\n","stream":"stderr","time":"2019-09-20T18:06:03.149115054Z"}
          #format /^time=.{2}(?<time>.*Z).{2} level=(?<level>.*) msg=(?<log>.*)$/
          format /^(?<logtime>.*) level=(?<level>.*) msg=(?<msg>.*)$/
        </pattern>
        <pattern>
          #fluentd-container
          #{"log":"/var/lib/gems/2.3.0/gems/fluentd-1.6.3/lib/fluent/plugin/parser_regexp.rb:50: warning: regular expression has ']' without escape\n","stream":"stderr", "time": "2019-09-20T18:06:03.149115054Z"}
          format /^(?<request_file>.*[0-9]+): (?<level>.*): (?<msg>.*)$/
        </pattern>
        <pattern>
          #nginx-container error log
          #2019/10/18 12:00:00 [warn] 123#123: *xxxyyy zzz...
          format /^(?<logtime>.*) \[(?<level>.*)\] (?<pid>[0-9]+)#(?<tid>[0-9]+): (?<msg>.*)$/
        </pattern>
      </parse>
    </filter>
    # Modify tag to container name
    <match kubernetes.**>
      @type rewrite_tag_filter
      <rule>
        key $['kubernetes']['container_name']
        pattern ^(.+)$
        tag $1
      </rule>
    </match>
  # Reads selected systemd-journal streams (docker, container runtime, kernel).
  system.input.conf: |-
    #Logs from systemd-journal for interesting services.
    #TODO(random-liu): Remove this after cri container runtime rolls out.
    <source>
      @id journald-docker
      @type systemd
      matches [{ "_SYSTEMD_UNIT": "docker.service" }]
      <storage>
        @type local
        persistent true
        path /var/log/journald-docker.pos
      </storage>
      <entry>
        fields_strip_underscores true
        fields_lowercase true
      </entry>
      tag docker
    </source>
    # NOTE: "{{ fluentd_container_runtime_service }}" is a template placeholder;
    # replace it with the systemd unit name of the container runtime before
    # applying this manifest.
    <source>
      @id journald-container-runtime
      @type systemd
      matches [{ "_SYSTEMD_UNIT": "{{ fluentd_container_runtime_service }}.service" }]
      <storage>
        @type local
        persistent true
        path /var/log/journald-container-runtime.pos
      </storage>
      <entry>
        fields_strip_underscores true
        fields_lowercase true
      </entry>
      tag container-runtime
    </source>
    <source>
      @id kernel
      @type systemd
      matches [{ "_TRANSPORT": "kernel" }]
      <storage>
        @type local
        persistent true
        path /var/log/kernel.pos
      </storage>
      <entry>
        fields_strip_underscores true
        fields_lowercase true
      </entry>
      tag kernel
    </source>
  # Accepts events sent by other fluentd/fluent-bit instances.
  forward.input.conf: |-
    # Takes the messages sent over TCP
    <source>
      @id forward
      @type forward
    </source>
  # Exposes fluentd's own metrics for Prometheus.
  monitoring.conf: |-
    # Prometheus Exporter Plugin
    # input plugin that exports metrics
    <source>
      @id prometheus
      @type prometheus
    </source>
    <source>
      @id monitor_agent
      @type monitor_agent
    </source>
    # input plugin that collects metrics from MonitorAgent
    <source>
      @id prometheus_monitor
      @type prometheus_monitor
      <labels>
        host ${hostname}
      </labels>
    </source>
    # input plugin that collects metrics for output plugin
    <source>
      @id prometheus_output_monitor
      @type prometheus_output_monitor
      <labels>
        host ${hostname}
      </labels>
    </source>
    # input plugin that collects metrics for in_tail plugin
    <source>
      @id prometheus_tail_monitor
      @type prometheus_tail_monitor
      <labels>
        host ${hostname}
      </labels>
    </source>
  # Ships every event to Elasticsearch. Replace host/user/password and the
  # index prefix with the values of your own cluster.
  output.conf: |-
    <match **>
      @id elasticsearch
      @type elasticsearch
      type_name _doc
      include_tag_key true
      host your-es-cluster-endpoint
      port 9200
      user your-es-account
      password your-es-password
      logstash_format true
      logstash_prefix your-es-index-prefix.${tag}
      request_timeout 30s
      # "tag" must be a chunk key, otherwise the ${tag} placeholder in
      # logstash_prefix above is not expanded.
      <buffer tag>
        @type file
        path /var/log/fluentd-buffers/kubernetes.system.buffer
        chunk_limit_size 64MB
        total_limit_size 32GB
        flush_mode interval
        retry_type exponential_backoff
        flush_thread_count 2
        flush_interval 5s
        retry_forever
        retry_max_interval 30
        queue_limit_length 8
        overflow_action block
      </buffer>
    </match>

从上面可以看到,这里不仅收集业务容器的日志, 同时也收集kubernetes集群组件的日志. 业务容器跟集群组件的日志虽然都是json格式,但为了更细粒度地进行数据分析,还需要使用正则对其中的log字段进行处理.

最后需要修改es的集群地址,用户、密码,索引前缀等信息.

这里没有使用kafka进行缓存, 一来因为使用kafka后,又需要一个logstash过渡到es,增加了一层,又得多维护一层配置,后续增加索引时不是很方便.

二来数据量没达到一个量级, 没有kafka,es也能够扛住.

如果需要使用kafka的话,也可以使用kafka-connector机制来直接对接elasticsearch, github上已经有现成的工具, 大家可参考使用, 我这里没有用过.

问题总结

在体验的时候,由于各个组件打印的日志格式都不尽相同, 为了接收更多的组件日志来定位问题,在编写正则表达式进行匹配时花的时间最长. 同时,大家也可以开启fluentd的debug日志,或者将收集到的日志直接打印在本地,这样定位问题会方便一点. 最后切记将debug关掉, 不然磁盘会扛不住.

Fluentd本地保存收集日志的配置

1
2
3
4
5
6
7
8
9
#  outputfile.conf: |- 
# <match **>
# @type file
# path /var/log/${tag}.fluentd
# <buffer tag>
# @type file
# path /var/log/xxyy
# </buffer>
# </match>

fluentd开启es的debug, 更多参数可参考这里

遇到一个elasticsearch 索引mapping问题,感兴趣的可参考这里

参考文章: