diff --git a/iac/ansible/roles/ceph_setup/tasks/main.yml b/iac/ansible/roles/ceph_setup/tasks/main.yml
index 91f7252..c541a4e 100644
--- a/iac/ansible/roles/ceph_setup/tasks/main.yml
+++ b/iac/ansible/roles/ceph_setup/tasks/main.yml
@@ -90,4 +90,19 @@
     src: "{{ hostvars[groups['managers'][0]]['ceph_bootstrap_ip'] }}:/"
     fstype: ceph
     opts: "name=admin,secret={{ ceph_admin_key.stdout }}"
-    state: mounted
\ No newline at end of file
+    state: mounted
+
+# Enable metrics
+- name: CEPH | Check whether the Prometheus module is already enabled
+  ansible.builtin.command: "ceph mgr module ls --format json"
+  register: ceph_modules_status
+  changed_when: false
+  delegate_to: "{{ groups['managers'][0] }}"
+  run_once: true
+
+- name: CEPH | Enable the Prometheus module
+  ansible.builtin.command: "ceph mgr module enable prometheus"
+  # Only runs if 'prometheus' is missing from the 'enabled_modules' list in the JSON output
+  when: "'prometheus' not in (ceph_modules_status.stdout | from_json).enabled_modules"
+  delegate_to: "{{ groups['managers'][0] }}"
+  run_once: true
\ No newline at end of file
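Note (illustrative, not part of the diff): after enabling the module it can take a few seconds for the mgr to start serving metrics. A minimal smoke test, assuming the mgr exposes its default Prometheus port 9283 (the same port opened in the ufw_firewall change below); the task name and retry values are hypothetical:

```yaml
- name: CEPH | Verify the mgr Prometheus endpoint responds (illustrative)
  ansible.builtin.uri:
    url: "http://127.0.0.1:9283/metrics"
    status_code: 200
  register: ceph_metrics_probe
  until: ceph_metrics_probe.status == 200
  retries: 5
  delay: 10
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true
```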
diff --git a/iac/ansible/roles/hyperdx/files/config.xml b/iac/ansible/roles/hyperdx/files/config.xml
new file mode 100644
index 0000000..84eca4b
--- /dev/null
+++ b/iac/ansible/roles/hyperdx/files/config.xml
@@ -0,0 +1,175 @@
+<clickhouse>
+    <logger>
+        <level>debug</level>
+        <console>true</console>
+    </logger>
+
+    <listen_host>0.0.0.0</listen_host>
+    <http_port>8123</http_port>
+    <tcp_port>9000</tcp_port>
+    <interserver_http_host>ch-server</interserver_http_host>
+    <interserver_http_port>9009</interserver_http_port>
+
+    <max_connections>4096</max_connections>
+    <keep_alive_timeout>64</keep_alive_timeout>
+    <max_concurrent_queries>100</max_concurrent_queries>
+    <uncompressed_cache_size>8589934592</uncompressed_cache_size>
+    <mark_cache_size>5368709120</mark_cache_size>
+
+    <path>/var/lib/clickhouse/</path>
+    <tmp_path>/var/lib/clickhouse/tmp/</tmp_path>
+    <user_files_path>/var/lib/clickhouse/user_files/</user_files_path>
+
+    <users_config>users.xml</users_config>
+
+    <default_profile>default</default_profile>
+    <default_database>default</default_database>
+    <timezone>UTC</timezone>
+    <mlock_executable>false</mlock_executable>
+
+    <prometheus>
+        <endpoint>/metrics</endpoint>
+        <port>9363</port>
+        <metrics>true</metrics>
+        <events>true</events>
+        <asynchronous_metrics>true</asynchronous_metrics>
+        <status_info>true</status_info>
+    </prometheus>
+
+    <query_log>
+        <database>system</database>
+        <table>query_log</table>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </query_log>
+
+    <metric_log>
+        <database>system</database>
+        <table>metric_log</table>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+        <collect_interval_milliseconds>1000</collect_interval_milliseconds>
+    </metric_log>
+
+    <asynchronous_metric_log>
+        <database>system</database>
+        <table>asynchronous_metric_log</table>
+        <flush_interval_milliseconds>7000</flush_interval_milliseconds>
+    </asynchronous_metric_log>
+
+    <opentelemetry_span_log>
+        <engine>
+            engine MergeTree
+            partition by toYYYYMM(finish_date)
+            order by (finish_date, finish_time_us, trace_id)
+        </engine>
+        <database>system</database>
+        <table>opentelemetry_span_log</table>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </opentelemetry_span_log>
+
+    <crash_log>
+        <database>system</database>
+        <table>crash_log</table>
+        <partition_by />
+        <flush_interval_milliseconds>1000</flush_interval_milliseconds>
+    </crash_log>
+
+    <processors_profile_log>
+        <database>system</database>
+        <table>processors_profile_log</table>
+        <partition_by>toYYYYMM(event_date)</partition_by>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </processors_profile_log>
+
+    <part_log>
+        <database>system</database>
+        <table>part_log</table>
+        <partition_by>toYYYYMM(event_date)</partition_by>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </part_log>
+
+    <trace_log>
+        <database>system</database>
+        <table>trace_log</table>
+        <partition_by>toYYYYMM(event_date)</partition_by>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </trace_log>
+
+    <query_thread_log>
+        <database>system</database>
+        <table>query_thread_log</table>
+        <partition_by>toYYYYMM(event_date)</partition_by>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </query_thread_log>
+
+    <query_views_log>
+        <database>system</database>
+        <table>query_views_log</table>
+        <partition_by>toYYYYMM(event_date)</partition_by>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </query_views_log>
+
+    <remote_servers>
+        <default>
+            <shard>
+                <replica>
+                    <host>ch-server</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+        </default>
+    </remote_servers>
+
+    <distributed_ddl>
+        <path>/clickhouse/task_queue/ddl</path>
+    </distributed_ddl>
+
+    <format_schema_path>/var/lib/clickhouse/format_schemas/</format_schema_path>
+</clickhouse>
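Note (illustrative, not part of the diff): since the XML files are copied verbatim, a cheap pre-flight check is to validate their well-formedness on the control node before deploying; this assumes `xmllint` (libxml2) is installed locally:

```yaml
- name: HYPERDX | Validate ClickHouse XML configs before copying (illustrative)
  ansible.builtin.command: "xmllint --noout {{ item }}"
  loop:
    - files/config.xml
    - files/users.xml
  delegate_to: localhost
  changed_when: false
```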
diff --git a/iac/ansible/roles/hyperdx/files/users.xml b/iac/ansible/roles/hyperdx/files/users.xml
new file mode 100644
index 0000000..80c8d4f
--- /dev/null
+++ b/iac/ansible/roles/hyperdx/files/users.xml
@@ -0,0 +1,51 @@
+<clickhouse>
+    <profiles>
+        <default>
+            <max_memory_usage>10000000000</max_memory_usage>
+            <use_uncompressed_cache>0</use_uncompressed_cache>
+            <load_balancing>in_order</load_balancing>
+            <log_queries>1</log_queries>
+        </default>
+    </profiles>
+
+    <users>
+        <default>
+            <profile>default</profile>
+            <networks>
+                <ip>::/0</ip>
+            </networks>
+            <quota>default</quota>
+        </default>
+
+        <api>
+            <password>api</password>
+            <profile>default</profile>
+            <networks>
+                <ip>::/0</ip>
+            </networks>
+            <quota>default</quota>
+        </api>
+
+        <worker>
+            <password>worker</password>
+            <profile>default</profile>
+            <networks>
+                <ip>::/0</ip>
+            </networks>
+            <quota>default</quota>
+        </worker>
+    </users>
+
+    <quotas>
+        <default>
+            <interval>
+                <duration>3600</duration>
+                <queries>0</queries>
+                <errors>0</errors>
+                <result_rows>0</result_rows>
+                <read_rows>0</read_rows>
+                <execution_time>0</execution_time>
+            </interval>
+        </default>
+    </quotas>
+</clickhouse>
diff --git a/iac/ansible/roles/hyperdx/tasks/main.yml b/iac/ansible/roles/hyperdx/tasks/main.yml
new file mode 100644
index 0000000..6bd89d5
--- /dev/null
+++ b/iac/ansible/roles/hyperdx/tasks/main.yml
@@ -0,0 +1,66 @@
+---
+- name: HYPERDX | Create directories
+  ansible.builtin.file:
+    path: "{{ data_dir }}/{{ item.path }}"
+    state: directory
+    owner: "{{ item.uid }}"
+    group: "{{ item.gid }}"
+    mode: '0755'
+    recurse: no
+  loop:
+    - { path: 'mongo', uid: 999, gid: 999 }             # MongoDB image default UID/GID
+    - { path: 'clickhouse/data', uid: 101, gid: 101 }   # ClickHouse image default UID/GID
+    - { path: 'clickhouse/logs', uid: 101, gid: 101 }
+    - { path: 'clickhouse/config', uid: 101, gid: 101 }
+  run_once: true
+  delegate_to: "{{ groups['managers'][0] }}"
+
+- name: HYPERDX | Copy ClickHouse configuration
+  ansible.builtin.copy:
+    src: "{{ item }}"
+    dest: "{{ data_dir }}/clickhouse/config/"
+    owner: 101
+    group: 101
+    mode: '0644'
+  loop:
+    - files/config.xml   # local to the Ansible repo
+    - files/users.xml
+  run_once: true
+  delegate_to: "{{ groups['managers'][0] }}"
+
+- name: HYPERDX | Create shared-observability network
+  community.docker.docker_network:
+    name: shared-observability
+    driver: overlay
+    state: present
+    attachable: yes
+    ipam_config:
+      - subnet: '172.16.116.0/24'
+        gateway: '172.16.116.1'
+  run_once: true
+  delegate_to: "{{ groups['managers'][0] }}"
+
+- name: HYPERDX | Generate OTel Collector config
+  ansible.builtin.template:
+    src: otel-collector-config.yaml.j2
+    dest: "{{ data_dir }}/data/otel-collector-config.yaml"
+    mode: '0644'
+  run_once: true
+  delegate_to: "{{ groups['managers'][0] }}"
+
+- name: HYPERDX | Generate Compose file
+  ansible.builtin.template:
+    src: docker-compose.yml.j2
+    dest: '{{ data_dir }}/hyperdx.yml'
+    mode: '0644'
+  run_once: true
+  delegate_to: "{{ groups['managers'][0] }}"
+
+- name: HYPERDX | Deploy stack
+  community.docker.docker_stack:
+    state: present
+    name: hyperdx
+    compose:
+      - '{{ data_dir }}/hyperdx.yml'
+  delegate_to: "{{ groups['managers'][0] }}"
+  run_once: true
diff --git a/iac/ansible/roles/hyperdx/templates/docker-compose.yml.j2 b/iac/ansible/roles/hyperdx/templates/docker-compose.yml.j2
new file mode 100644
index 0000000..bbfb229
--- /dev/null
+++ b/iac/ansible/roles/hyperdx/templates/docker-compose.yml.j2
@@ -0,0 +1,99 @@
+version: '3.9'
+
+services:
+  db:
+    image: mongo:5.0.14-focal
+    volumes:
+      - "{{ data_dir }}/mongo:/data/db"
+    networks:
+      - internal
+    deploy:
+      mode: replicated
+      replicas: 1
+      # placement:
+      #   constraints: [node.role == worker]  # prefer keeping DBs on workers where possible
+
+  otel-collector:
+    image: "clickhouse/clickstack-otel-collector:2"
+    environment:
+      CLICKHOUSE_ENDPOINT: 'tcp://ch-server:9000?dial_timeout=10s'
+      HYPERDX_OTEL_EXPORTER_CLICKHOUSE_DATABASE: "default"
+      HYPERDX_LOG_LEVEL: "{{ hyperdx_log_level | default('info') }}"
+      OPAMP_SERVER_URL: 'http://app:{{ hyperdx_opamp_port | default(4320) }}'
+    ports:
+      - "4317:4317"   # OTLP gRPC
+      - "4318:4318"   # OTLP HTTP
+      # - "8888:8888" # metrics (optional)
+    networks:
+      - internal
+      - shared-observability
+      - traefik_public
+    deploy:
+      mode: replicated
+      replicas: 3
+      labels:
+        - "traefik.enable=true"
+        - "traefik.docker.network=traefik_public"
+        - "traefik.http.routers.otel-collector.rule=Host(`{{ otlp_domain }}`)"
+        - "traefik.http.routers.otel-collector.entrypoints=https"
+        - "traefik.http.routers.otel-collector.tls.certresolver=main"
+        - "traefik.http.services.otel-collector.loadbalancer.server.port=4318"
+
+  app:
+    image: "hyperdx/hyperdx:2"
+    environment:
+      # URLs must match the externally reachable Traefik hostnames
+      FRONTEND_URL: "https://{{ hdx_domain }}"
+      HYPERDX_APP_URL: "https://{{ hdx_domain }}"
+
+      HYPERDX_API_KEY: "{{ hyperdx_api_key }}"
+      HYPERDX_API_PORT: "{{ hyperdx_api_port | default(8000) }}"
+      HYPERDX_APP_PORT: "{{ hyperdx_app_port | default(8080) }}"
+      HYPERDX_LOG_LEVEL: "{{ hyperdx_log_level | default('info') }}"
+      MINER_API_URL: 'http://miner:5123'   # in case the miner is needed (not defined in the original compose?)
+      MONGO_URI: 'mongodb://db:27017/hyperdx'
+      SERVER_URL: "http://127.0.0.1:{{ hyperdx_api_port | default(8000) }}"
+      OPAMP_PORT: "{{ hyperdx_opamp_port | default(4320) }}"
+      OTEL_EXPORTER_OTLP_ENDPOINT: 'http://otel-collector:4318'
+      OTEL_SERVICE_NAME: 'hdx-oss-app'
+      USAGE_STATS_ENABLED: "{{ usage_stats_enabled | default('false') }}"
+      # ClickHouse connection string (default user/password from the ClickHouse image)
+      DEFAULT_CONNECTIONS: >-
+        [{"name":"Local ClickHouse","host":"http://ch-server:8123","username":"default","password":""}]
+      DEFAULT_SOURCES: '{{ hyperdx_default_sources | to_json }}'
+    networks:
+      - internal
+      - traefik_public
+    deploy:
+      labels:
+        - "traefik.enable=true"
+        - "traefik.docker.network=traefik_public"
+        - "traefik.http.routers.hyperdx.rule=Host(`{{ hdx_domain }}`)"
+        - "traefik.http.routers.hyperdx.entrypoints=https"
+        - "traefik.http.routers.hyperdx.tls.certresolver=main"
+        - "traefik.http.services.hyperdx.loadbalancer.server.port={{ hyperdx_app_port | default(8080) }}"
+
+  ch-server:
+    image: clickhouse/clickhouse-server:25.6-alpine
+    environment:
+      CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
+    volumes:
+      - "{{ data_dir }}/clickhouse/config/config.xml:/etc/clickhouse-server/config.xml"
+      - "{{ data_dir }}/clickhouse/config/users.xml:/etc/clickhouse-server/users.xml"
+      - "{{ data_dir }}/clickhouse/data:/var/lib/clickhouse"
+      - "{{ data_dir }}/clickhouse/logs:/var/log/clickhouse-server"
+    deploy:
+      mode: replicated
+      replicas: 1
+      # placement:
+      #   constraints: [node.role == worker]
+    networks:
+      - internal
+
+networks:
+  internal:
+    driver: overlay
+  traefik_public:
+    external: true
+  shared-observability:
+    external: true
\ No newline at end of file
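Note (illustrative, not part of the diff): once the stack is deployed, convergence can be checked with the plain docker CLI from the same manager; a minimal sketch:

```yaml
- name: HYPERDX | Show stack service replica status (illustrative)
  ansible.builtin.command: docker stack services hyperdx
  register: hyperdx_services
  changed_when: false
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true

- name: HYPERDX | Print replica counts
  ansible.builtin.debug:
    msg: "{{ hyperdx_services.stdout_lines }}"
```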
diff --git a/iac/ansible/roles/hyperdx/vars/main.yml b/iac/ansible/roles/hyperdx/vars/main.yml
new file mode 100644
index 0000000..bee7321
--- /dev/null
+++ b/iac/ansible/roles/hyperdx/vars/main.yml
@@ -0,0 +1,103 @@
+data_dir: "{{ ceph_volume }}/hyperdx"
+subdomain: "hdx"
+
+hdx_domain: "{{ subdomain }}.{{ main_domain }}"
+otlp_domain: "otlp.{{ main_domain }}"
+
+# Generate a secure key with: `openssl rand -hex 16`
+hyperdx_api_key: ""
+hyperdx_api_port: 8000
+hyperdx_app_port: 8080
+hyperdx_log_level: "info"
+hyperdx_opamp_port: 4320
+
+usage_stats_enabled: "false"
+
+# Data source definitions for the frontend
+hyperdx_default_sources:
+  - name: "Logs"
+    kind: "log"
+    from:
+      databaseName: "default"
+      tableName: "otel_logs"
+    timestampValueExpression: "TimestampTime"
+    displayedTimestampValueExpression: "Timestamp"
+    implicitColumnExpression: "Body"
+    serviceNameExpression: "ServiceName"
+    bodyExpression: "Body"
+    eventAttributesExpression: "LogAttributes"
+    resourceAttributesExpression: "ResourceAttributes"
+    defaultTableSelectExpression: "Timestamp,ServiceName,SeverityText,Body"
+    severityTextExpression: "SeverityText"
+    traceIdExpression: "TraceId"
+    spanIdExpression: "SpanId"
+    connection: "Local ClickHouse"
+    traceSourceId: "Traces"
+    sessionSourceId: "Sessions"
+    metricSourceId: "Metrics"
+
+  - name: "Traces"
+    kind: "trace"
+    from:
+      databaseName: "default"
+      tableName: "otel_traces"
+    timestampValueExpression: "Timestamp"
+    displayedTimestampValueExpression: "Timestamp"
+    implicitColumnExpression: "SpanName"
+    serviceNameExpression: "ServiceName"
+    bodyExpression: "SpanName"
+    eventAttributesExpression: "SpanAttributes"
+    resourceAttributesExpression: "ResourceAttributes"
+    defaultTableSelectExpression: "Timestamp,ServiceName,StatusCode,round(Duration/1e6),SpanName"
+    traceIdExpression: "TraceId"
+    spanIdExpression: "SpanId"
+    durationExpression: "Duration"
+    durationPrecision: 9
+    parentSpanIdExpression: "ParentSpanId"
+    spanNameExpression: "SpanName"
+    spanKindExpression: "SpanKind"
+    statusCodeExpression: "StatusCode"
+    statusMessageExpression: "StatusMessage"
+    connection: "Local ClickHouse"
+    logSourceId: "Logs"
+    sessionSourceId: "Sessions"
+    metricSourceId: "Metrics"
+
+  - name: "Metrics"
+    kind: "metric"
+    from:
+      databaseName: "default"
+      tableName: ""
+    timestampValueExpression: "TimeUnix"
+    resourceAttributesExpression: "ResourceAttributes"
+    metricTables:
+      gauge: "otel_metrics_gauge"
+      histogram: "otel_metrics_histogram"
+      sum: "otel_metrics_sum"
+      _id: "682586a8b1f81924e628e808"
+      id: "682586a8b1f81924e628e808"
+    connection: "Local ClickHouse"
+    logSourceId: "Logs"
+    traceSourceId: "Traces"
+    sessionSourceId: "Sessions"
+
+  - name: "Sessions"
+    kind: "session"
+    from:
+      databaseName: "default"
+      tableName: "hyperdx_sessions"
+    timestampValueExpression: "TimestampTime"
+    displayedTimestampValueExpression: "Timestamp"
+    implicitColumnExpression: "Body"
+    serviceNameExpression: "ServiceName"
+    bodyExpression: "Body"
+    eventAttributesExpression: "LogAttributes"
+    resourceAttributesExpression: "ResourceAttributes"
+    defaultTableSelectExpression: "Timestamp,ServiceName,SeverityText,Body"
+    severityTextExpression: "SeverityText"
+    traceIdExpression: "TraceId"
+    spanIdExpression: "SpanId"
+    connection: "Local ClickHouse"
+    logSourceId: "Logs"
+    traceSourceId: "Traces"
+    metricSourceId: "Metrics"
\ No newline at end of file
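Note (illustrative, not part of the diff): `hyperdx_api_key` ships empty and the comment above suggests `openssl rand -hex 16`. A hedged sketch that derives an equivalent 32-character hex key at runtime when none is configured — a vaulted variable is the more durable option; the task name is hypothetical:

```yaml
- name: HYPERDX | Generate an API key when none is configured (illustrative)
  ansible.builtin.set_fact:
    hyperdx_api_key: "{{ lookup('ansible.builtin.password', '/dev/null', chars=['hexdigits'], length=32) | lower }}"
  when: hyperdx_api_key | length == 0
  run_once: true
```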
diff --git a/iac/ansible/roles/infra-monitoring/defaults/main.yml b/iac/ansible/roles/infra-monitoring/defaults/main.yml
new file mode 100644
index 0000000..e69de29
diff --git a/iac/ansible/roles/infra-monitoring/tasks/main.yml b/iac/ansible/roles/infra-monitoring/tasks/main.yml
new file mode 100644
index 0000000..17fe4d9
--- /dev/null
+++ b/iac/ansible/roles/infra-monitoring/tasks/main.yml
@@ -0,0 +1,32 @@
+---
+- name: MONITORING | Ensure data directories
+  ansible.builtin.file:
+    path: "{{ data_dir }}/data"
+    state: directory
+    mode: '0755'
+  run_once: true
+  delegate_to: "{{ groups['managers'][0] }}"
+
+- name: MONITORING | Generate config
+  ansible.builtin.template:
+    src: otel-agent-config.yaml.j2
+    dest: "{{ data_dir }}/otel-agent-config.yaml"
+  delegate_to: "{{ groups['managers'][0] }}"
+  run_once: true
+
+- name: MONITORING | Generate Compose file
+  ansible.builtin.template:
+    src: docker-compose.yml.j2
+    dest: "{{ data_dir }}/monitoring.yml"
+    mode: '0644'
+  run_once: true
+  delegate_to: "{{ groups['managers'][0] }}"
+
+- name: MONITORING | Deploy stack
+  community.docker.docker_stack:
+    state: present
+    name: infra-monitoring
+    compose:
+      - "{{ data_dir }}/monitoring.yml"
+  delegate_to: "{{ groups['managers'][0] }}"
+  run_once: true
\ No newline at end of file
diff --git a/iac/ansible/roles/infra-monitoring/templates/docker-compose.yml.j2 b/iac/ansible/roles/infra-monitoring/templates/docker-compose.yml.j2
new file mode 100644
index 0000000..3600619
--- /dev/null
+++ b/iac/ansible/roles/infra-monitoring/templates/docker-compose.yml.j2
@@ -0,0 +1,38 @@
+version: '3.9'
+
+services:
+  otel-agent:
+    image: otel/opentelemetry-collector-contrib:0.143.0
+    user: "0:0"   # root for host-level hardware access
+    command: ["--config=/etc/otel-agent-config.yaml"]
+    security_opt:
+      - apparmor:unconfined
+    volumes:
+      - {{ data_dir }}/otel-agent-config.yaml:/etc/otel-agent-config.yaml
+      - /:/hostfs:ro
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+      - /sys:/hostfs/sys:ro
+      - /proc:/hostfs/proc:ro
+    environment:
+      - GOMEMLIMIT=180MiB
+      - HOST_PROC=/hostfs/proc
+      - HOST_SYS=/hostfs/sys
+      - HOST_ETC=/hostfs/etc
+      - HOST_VAR=/hostfs/var
+      - HOST_RUN=/hostfs/run
+      - HOST_DEV=/hostfs/dev
+    deploy:
+      mode: global
+      update_config:
+        parallelism: 1
+        delay: 10s
+      resources:
+        limits:
+          memory: 200M
+    networks:
+      - host
+
+networks:
+  host:
+    name: host
+    external: true
\ No newline at end of file
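Note (illustrative, not part of the diff): the compose file caps the agent at 200M and sets GOMEMLIMIT=180MiB. The collector can additionally shed load before hitting that hard cap via the `memory_limiter` processor; a sketch of what would need to be added to the agent config, with the processor placed first in each pipeline (e.g. `processors: [memory_limiter, resourcedetection, batch]`) — the spike value here is an assumption:

```yaml
processors:
  # Refuses/drops data when soft limits are reached instead of being OOM-killed
  memory_limiter:
    check_interval: 1s
    limit_mib: 180        # aligned with GOMEMLIMIT
    spike_limit_mib: 40   # illustrative headroom for bursts
```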
diff --git a/iac/ansible/roles/infra-monitoring/templates/otel-agent-config.yaml.j2 b/iac/ansible/roles/infra-monitoring/templates/otel-agent-config.yaml.j2
new file mode 100644
index 0000000..23a4fdb
--- /dev/null
+++ b/iac/ansible/roles/infra-monitoring/templates/otel-agent-config.yaml.j2
@@ -0,0 +1,117 @@
+extensions:
+  # Watches Docker containers
+  docker_observer:
+    endpoint: "unix:///var/run/docker.sock"
+    cache_sync_interval: 30s
+
+receivers:
+  hostmetrics:
+    root_path: /hostfs
+    collection_interval: 15s
+    scrapers:
+      cpu:
+        metrics:
+          system.cpu.time:
+            enabled: true
+          system.cpu.utilization:
+            enabled: true
+      memory:
+        metrics:
+          system.memory.usage:
+            enabled: true
+          system.memory.utilization:
+            enabled: true
+      filesystem:
+        metrics:
+          system.filesystem.usage:
+            enabled: true
+          system.filesystem.utilization:
+            enabled: true
+      paging:
+        metrics:
+          system.paging.usage:
+            enabled: true
+          system.paging.utilization:
+            enabled: true
+          system.paging.faults:
+            enabled: true
+      load:
+      disk:
+      network:
+
+  docker_stats:
+    endpoint: unix:///var/run/docker.sock
+    collection_interval: 30s
+    timeout: 20s
+
+  # receiver_creator:
+  #   watch_observers: [docker_observer]
+  #   receivers:
+  #     filelog:
+  #       rule: type == "container"   # containers only
+  #       config:
+  #         include:
+  #           - /hostfs/var/lib/docker/containers/*/*.log
+  #         operators:
+  #           - type: container
+  #             format: docker
+  #             add_metadata_from_filepath: true
+  #           - type: json_parser
+  #             timestamp:
+  #               parse_from: time
+  #               layout: '%Y-%m-%dT%H:%M:%S.%LZ'
+  #             severity:
+  #               parse_from: stream
+  #               mapping:
+  #                 info: stdout
+  #                 error: stderr
+
+  # Ceph scraping (only works on nodes where a Ceph mgr is running)
+  prometheus:
+    config:
+      scrape_configs:
+        - job_name: 'ceph-local'
+          scrape_interval: 30s
+          scrape_timeout: 10s
+          static_configs:
+            - targets: ['127.0.0.1:9283']
+          metric_relabel_configs:
+            - source_labels: [__name__]
+              regex: 'ceph_cluster_total_.*|ceph_health_status|ceph_osd_.*|ceph_pool_.*'
+              action: keep
+
+processors:
+  batch:
+    timeout: 5s
+
+  resourcedetection:
+    detectors: [env, system]
+
+  resourcedetection/docker:
+    detectors: [env, docker]
+    timeout: 2s
+    override: false
+
+exporters:
+  debug:
+    verbosity: detailed
+  otlp:
+    endpoint: "127.0.0.1:4317"
+    headers:
+      authorization: {{ hyperdx_api_ingestion_key }}
+    compression: gzip
+    tls:
+      insecure: true
+
+service:
+  extensions: [docker_observer]
+  pipelines:
+    metrics:
+      receivers: [hostmetrics, docker_stats, prometheus]
+      # receivers: [hostmetrics]
+      processors: [resourcedetection, batch]
+      exporters: [otlp]
+    # logs:
+    #   receivers: [receiver_creator]
+    #   processors: [resourcedetection/docker, batch]
+    #   exporters: [otlp, debug]
\ No newline at end of file
diff --git a/iac/ansible/roles/infra-monitoring/vars/main.yml b/iac/ansible/roles/infra-monitoring/vars/main.yml
new file mode 100644
index 0000000..185c87c
--- /dev/null
+++ b/iac/ansible/roles/infra-monitoring/vars/main.yml
@@ -0,0 +1 @@
+data_dir: "{{ ceph_volume }}/infra-monitoring"
\ No newline at end of file
diff --git a/iac/ansible/roles/monitoring/tasks/main.yml b/iac/ansible/roles/monitoring/tasks/main.yml
deleted file mode 100644
index 45befbc..0000000
--- a/iac/ansible/roles/monitoring/tasks/main.yml
+++ /dev/null
@@ -1,13 +0,0 @@
----
--- name: Copy Stack Files
-  copy:
-    directory_mode: true
-    src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/monitoring
-    dest: /srv
-- block:
-    - name: Deploy Monitoring stack
-      community.docker.docker_stack:
-        state: present
-        name: monitoring
-        compose:
-          - /srv/monitoring/observability.yml
diff --git a/iac/ansible/roles/ufw_firewall/tasks/main.yml b/iac/ansible/roles/ufw_firewall/tasks/main.yml
index e992ebb..60616f4 100644
--- a/iac/ansible/roles/ufw_firewall/tasks/main.yml
+++ b/iac/ansible/roles/ufw_firewall/tasks/main.yml
@@ -59,6 +59,14 @@
     interface: "{{ private_interface }}"
     direction: in
 
+- name: FIREWALL | Allow the Ceph Prometheus exporter on the private interface
+  community.general.ufw:
+    rule: allow
+    port: "9283"
+    proto: tcp
+    interface: "{{ private_interface }}"
+    direction: in
+
 - name: FIREWALL | Docker Swarm Management Ports auf privatem Interface erlauben
   community.general.ufw:
     rule: allow
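Note (illustrative, not part of the diff): the agent exports to `127.0.0.1:4317`, which works because the hyperdx stack publishes 4317/4318 through the swarm ingress on every node. A minimal end-to-end reachability check, with hypothetical task naming:

```yaml
- name: MONITORING | Wait for the local OTLP ingest ports (illustrative)
  ansible.builtin.wait_for:
    host: 127.0.0.1
    port: "{{ item }}"
    timeout: 60
  loop:
    - 4317   # OTLP gRPC, used by the node agents
    - 4318   # OTLP HTTP, fronted by Traefik via otlp_domain
```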