add platform services

Marcel Arndt 2026-01-05 16:01:40 +01:00
parent e25d8dd5d9
commit 193319fa52
65 changed files with 4240 additions and 39 deletions

8
.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
.local/
leantime/
.vscode/
.DS_Store

35
Readme.md Normal file
View File

@ -0,0 +1,35 @@
██████╗ ███████╗███╗ ██╗██╗██╗ ██╗ ██████╗ ██████╗ ███████╗ ██████╗
██╔════╝ ██╔════╝████╗ ██║██║██║ ██║██╔════╝ ██╔════╝ ██╔════╝██╔═══██╗
██║ ███╗█████╗ ██╔██╗ ██║██║██║ ██║╚█████╗ ██║ █████╗ ██║ ██║
██║ ██║██╔══╝ ██║╚██╗██║██║██║ ██║ ╚═══██╗ ██║ ██╔══╝ ██║ ██║
╚██████╔╝███████╗██║ ╚████║██║╚█████╔╝██████╔╝ ╚██████╗ ███████╗╚██████╔╝
╚═════╝ ╚══════╝╚═╝ ╚═══╝╚═╝ ╚════╝ ╚═════╝ ██ ╚═════╝ ╚══════╝ ╚═════╝
---
# Genius.ceo repository
Ceph Dashboard is now available at:
URL: https://manager-node-1:8443/
User: admin
Password: g0uhtgv520
Enabling client.admin keyring and conf on hosts with "admin" label
Saving cluster configuration to /var/lib/ceph/6fb9d55b-5b20-11f0-9be6-920006043bcc/config directory
You can access the Ceph CLI as following in case of multi-cluster or non-default config:
sudo /usr/sbin/cephadm shell --fsid 6fb9d55b-5b20-11f0-9be6-920006043bcc -c /etc/ceph/ceph.conf -k /etc/ceph/ceph.client.admin.keyring
Or, if you are only running a single cluster on this host:
sudo /usr/sbin/cephadm shell
Please consider enabling telemetry to help improve Ceph:
ceph telemetry on
For more information see:
https://docs.ceph.com/en/latest/mgr/telemetry/
Bootstrap complete.
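A quick way to confirm the bootstrapped cluster is healthy is to query it through the cephadm shell shown above; a minimal sketch, assuming it is run as root on manager-node-1:

sudo /usr/sbin/cephadm shell -- ceph -s            # overall cluster health
sudo /usr/sbin/cephadm shell -- ceph orch host ls  # hosts known to the orchestrator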

View File

@ -0,0 +1,33 @@
- name: Initialize and harden nodes
hosts: all
become: true
roles:
- role: common
tags: common
- role: ssh_hardening
tags: ssh
- role: ufw_firewall
tags: firewall
- role: fail2ban
tags: fail2ban
handlers:
- name: restart sshd
ansible.builtin.service:
name: ssh
state: restarted
- name: restart fail2ban
ansible.builtin.service:
name: fail2ban
state: restarted
- name: Setup Ceph Cluster and CephFS
hosts: all
become: true
roles:
- role: ceph_setup
- name: Initialize Docker Swarm
hosts: all
become: true
roles:
- role: docker_swarm

View File

@ -0,0 +1,9 @@
- name: Deploy infrastructure services
hosts: all
gather_facts: true
roles:
- traefik
- authentik
- portainer
- leantime
- kestra

View File

@ -0,0 +1,4 @@
APT::Periodic::Update-Package-Lists "1";
APT::Periodic::Download-Upgradeable-Packages "1";
APT::Periodic::AutocleanInterval "7";
APT::Periodic::Unattended-Upgrade "1";
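To check that these periodic settings are in effect and that unattended-upgrades would act on them, the following commands can be used; a sketch, assuming the unattended-upgrades package is installed on the node:

apt-config dump | grep Periodic            # show the effective APT::Periodic settings
sudo unattended-upgrade --dry-run --debug  # simulate an unattended upgrade run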

View File

@ -0,0 +1,9 @@
Unattended-Upgrade::Allowed-Origins {
"${distro_id}:${distro_codename}-security";
};
Unattended-Upgrade::Package-Blacklist {
};
Unattended-Upgrade::DevRelease "false";
Unattended-Upgrade::Remove-Unused-Kernel-Packages "true";
Unattended-Upgrade::Remove-Unused-Dependencies "true";
Unattended-Upgrade::Automatic-Reboot "false";

View File

@ -0,0 +1,12 @@
admin_user: 'admin'
ssh_port: 22
cephfs_name: "shared-fs"
ceph_osd_device: "/dev/sdb"
public_interface: 'eth0'
private_interface: 'enp7s0'
authorized_keys:
- 'ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKtYTptTN9ggoy0aUKXmxaPKpclEn86jM7s5UtTw1JJI' # Marcel's MacBook
main_domain: genius.ceo
ceph_volume: /mnt/cephfs
traefik_public_net: traefik_public

View File

@ -0,0 +1,59 @@
$ANSIBLE_VAULT;1.1;AES256
34613362353964313436306439386661613364663666653265313937343239633365663836653030
6262386661666364383961336461316139333262623034340a643434316632336132613264646437
36376365613061353866383135353432303433353931633063313566613166303064316666613132
3733623536643935370a656431646435626265666265666230356162656363663838636662313466
61336237306332643032653766313036636163336431613236663864636438363832383231323362
62666463336639303766356331353635323031636465616235663738333761653934346663386636
63623361363164663663313966653939643462353638613464396466613931363662623763326535
34376237353663656636363866373466346434666339646131396439653261373738636665613435
65356330303863303236373933333163633964633061393136646632386137346434353365343763
30343937656166303962653030366566616331666262343336343138623566353832313836643435
62636333346235316562303061656166383135633464623734626336623565346336626134333933
35363363376663333061663164623539363731613263376163306436636265336562396439356137
30663431373131303437393166396539306133636264653733303762316363386438643536306338
32303139303363316264393939326561393730396664343361393863303736343933636265633439
65633765666362396439643863653531363366383866373939616333353430633530343262366138
31663863663165653932653733623761613265383039336633383832393761666337336165613933
63383934366662353038626539633132313939376231643133363739303235326433353733363437
35626233613936626532326262646166363739666162353237323237383132333134343439336134
33613462393237626432386462373439303439356666336630363536366233346438313039346530
33393232333633663731393466653439623638316565346530306439326431323436356166633334
66383034643834613133333265646338303463393035393266653832366434313636633730636436
38353337633437656262623061666563646637626363353561323231376237623264373861376666
66363265633638356133353933613664353934373634613662326437336562663766306364303538
35623130616265623838353838396235386661666132623163383162373665313462663738303933
63363764653561616162386139646130393439373066666437623236383238396233653165623032
34316439376331356539626464313462616238623166623761626435303565653233386236656262
62613935336661623862323833353265366533643830373634663266666332333463303666343366
39653332346433306566316430656361363230343761613263393230366362363132663565636264
65313633653464663963373561373532636235353331353237623635613034613337343730656632
31656165666134333864353730363163623365393030333932393565666235643639303662663532
38343734393135643039633664653966313536616533656635373535636434396333313536623536
39623132326362656166366566373163386363336231633233353639313166333932656133363365
66666665346331613638656562396463386637356539366539343232353061666531353166396536
39623762633064323332653831643832303332396431633738396266633935656132323164613161
61353663383532613763356630373063383161376165333736316466353231656534366636313636
37616636383163616136643630363535346137636636633432643337393865393063626663333164
36656537343231386333323637386539386364356266376433616636313239376666353066306363
39376461323062393935613630656230346131373634363633393035346263663762623063356633
36646664623230303761373138333164303363373365386266386138653764623030623630333631
66363866633064656532336137613964653431663436333761666631656339646161636435343065
37646164653937633962386631373236653064346438323664383933643738656536356562626532
34663834363230303164626236393938643037363036613965373330636238633661346335336531
62663461626365386362393061626266303463663735303539383937363965383234666337386165
30366564363766623162306666656566353662633866396430396633623266383332303339666663
38313536666336323366616432336161656434646463373963356331326364333038366337386638
39396535386331663466323334613533383439343437363631363532313362663564353635343735
37653063383163316366366335663537653134326564643062653065303337303333643961383837
39393734326562616165313133643766303934336263326433366436623539633233643761616436
33356234313538343635343630623337343436346638396539316131623861353630333964633839
33316565326164386337623730623932313363306436316335336238333430626165663232343463
36653038633632616335393262656638346434386639383131396233643932323931393264613134
30336134343464373265636234656561653462356435383138323638613039623839373935326462
32393430616438356332313766353337383035623137363233323664393833303464313162303833
65383131313335353832343963636639346162353634306430353638393136623734623833306136
32396130623065326636633235346630336435663261353866323862666231656261333839373162
35623835663434356438653533623337363531353634663064303035633839656463656238636132
66316333356633613130323438376530623634336632323365616239373865623334363635396331
3263616336653336636666386632316564613331323431363935

10
iac/ansible/inventory.ini Normal file
View File

@ -0,0 +1,10 @@
[all:children]
managers
workers
[managers]
manager-node-1 ansible_host=37.27.215.220 ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=./.local/secure/private_key
manager-node-2 ansible_host=135.181.146.55 ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=./.local/secure/private_key
manager-node-3 ansible_host=65.109.135.85 ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=./.local/secure/private_key
[workers]
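With this inventory in place, connectivity and the playbooks can be exercised from the repository root roughly as follows; a sketch, assuming the private key exists at ./.local/secure/private_key as referenced above:

ansible all -i iac/ansible/inventory.ini -m ping                        # verify SSH reachability of all nodes
ansible-playbook -i iac/ansible/inventory.ini iac/ansible/playbook.yml  # run the main playbook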

10
iac/ansible/playbook.yml Normal file
View File

@ -0,0 +1,10 @@
---
- name: Main-Playbook
hosts: all
gather_facts: true
roles:
# - traefik
# - portainer
# - kestra
- gitea

View File

@ -0,0 +1,33 @@
networks:
traefik_public:
external: true
services:
dockge:
image: louislam/dockge:1
environment:
- DOCKGE_STACKS_DIR=/opt/stacks
- DOCKGE_DATA_DIR=/app/data
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- '/mnt/cephfs/dockge/data:/app/data'
- '/mnt/cephfs/dockge/stacks:/opt/stacks'
networks:
- traefik_public
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager
restart_policy:
condition: on-failure
labels:
- 'traefik.enable=true'
- 'traefik.swarm.network=traefik_public'
# --- Router for Dockge ---
- 'traefik.http.routers.dockge.rule=Host(`dockge.genius.ceo`)'
- 'traefik.http.routers.dockge.entrypoints=https'
- 'traefik.http.routers.dockge.tls.certresolver=main'
# --- Service for Dockge ---
- 'traefik.http.services.dockge.loadbalancer.server.port=5001'
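Deploying this file as a Swarm stack could look roughly like this; a sketch, assuming the compose file is saved as dockge.yml (hypothetical name) on a manager node and that the external traefik_public network has already been created by the Traefik role:

docker network inspect traefik_public    # must already exist (external: true)
docker stack deploy -c dockge.yml dockge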

View File

@ -0,0 +1,22 @@
services:
sd_server:
image: socheatsok78/dockerswarm_sd_server:latest
networks:
- sd_network
volumes:
- /var/run/docker.sock:/var/run/docker.sock
networks:
monitoring:
driver: overlay
attachable: true
ipam:
config:
- subnet: 172.16.201.0/24
sd_network:
driver: overlay
attachable: true
ipam:
config:
- subnet: 172.16.202.0/24

View File

@ -0,0 +1,139 @@
faro.receiver "stage_app_agent_receiver" {
server {
listen_address = "0.0.0.0"
listen_port = 12347
cors_allowed_origins = ["*"]
// cors_allowed_origins = ["https://avicenna.genius.ceo"]
api_key = "t3stK3y"
max_allowed_payload_size = "10MiB"
rate_limiting {
rate = 100
}
}
sourcemaps {}
output {
logs = [loki.process.logs_process_client.receiver]
traces = [otelcol.exporter.otlp.tempo.input]
}
}
loki.process "logs_process_client" {
forward_to = [loki.write.to_loki.receiver]
stage.logfmt {
mapping = { "kind" = "", "service_name" = "", "app_name" = "", "namespace" = "" }
}
stage.labels {
values = { "kind" = "kind", "service_name" = "service_name", "app" = "app_name", "namespace" = "namespace" }
}
}
otelcol.receiver.otlp "otel_collector" {
grpc {
endpoint = "0.0.0.0:4317"
}
http {
endpoint = "0.0.0.0:4318"
cors {
allowed_origins = ["https://avicenna.genius.ceo/"]
}
}
// Defines where the received data is forwarded
output {
metrics = [otelcol.exporter.prometheus.otel_metrics.input]
logs = [otelcol.exporter.loki.otel_logs.input]
traces = [otelcol.exporter.otlp.tempo.input]
}
}
loki.write "to_loki" {
endpoint {
url = "http://loki:3100/loki/api/v1/push"
}
}
prometheus.remote_write "to_prometheus" {
endpoint {
url = "http://prometheus:9090/api/v1/write"
}
}
// Discover Docker containers on the host
discovery.docker "logs_integration_docker" {
host = "unix:///var/run/docker.sock"
refresh_interval = "5s"
}
discovery.relabel "logs_integration_docker" {
targets = []
rule {
action = "labelmap"
regex = "__meta_docker_container_label_com_docker_swarm_node_id"
replacement = "node_id"
}
rule {
action = "labelmap"
regex = "__meta_docker_container_label_com_docker_stack_namespace"
replacement = "namespace"
}
rule {
action = "labelmap"
regex = "__meta_docker_container_label_com_docker_swarm_service_name"
replacement = "service_name"
}
rule {
action = "labelmap"
regex = "__meta_docker_container_name"
replacement = "container_name"
}
}
loki.source.docker "logs_from_containers" {
host = "unix:///var/run/docker.sock"
targets = discovery.docker.logs_integration_docker.targets // Uses the discovered containers
relabel_rules = discovery.relabel.logs_integration_docker.rules
// Forwards the collected logs to the configured Loki endpoint
forward_to = [loki.write.to_loki.receiver]
}
otelcol.exporter.otlp "tempo" { // Name kann variieren
client {
endpoint = "tempo:4317" // Ziel: Tempo Service auf Port 4317
tls {
insecure = true // Internal communication without TLS
}
}
}
otelcol.exporter.prometheus "otel_metrics" {
forward_to = [prometheus.remote_write.to_prometheus.receiver]
}
otelcol.exporter.loki "otel_logs" {
forward_to = [loki.write.to_loki.receiver]
}
// Configure logging for Alloy itself
logging {
level = "info"
format = "logfmt"
}
// prometheus.scrape "alloy_self" {
// targets = [
// prometheus.target_group {
// targets = [{"__address__" = "localhost:12345"}]
// }
// ]
// forward_to = [...] // To Prometheus remote write or a local agent
// }
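To smoke-test the OTLP HTTP receiver defined above, an empty but valid OTLP payload can be posted to port 4318; a minimal sketch, assuming it runs in a container attached to the same overlay network, where the service is reachable under the name alloy:

curl -s -X POST http://alloy:4318/v1/logs \
  -H 'Content-Type: application/json' \
  -d '{"resourceLogs":[]}'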

View File

@ -0,0 +1,76 @@
auth_enabled: false # Simplest configuration, no authentication
analytics:
reporting_enabled: false
server:
http_listen_port: 3100
grpc_listen_port: 9096 # Default gRPC port for Loki
common:
instance_addr: 127.0.0.1 # Address the instance advertises itself under
path_prefix: /loki # Where Loki stores its data (inside the volume)
storage:
filesystem: # Local filesystem for indexes and chunks
chunks_directory: /loki/chunks
rules_directory: /loki/rules
replication_factor: 1 # No replication for a single instance
ring:
kvstore:
store: inmemory # Simplest ring store for a single instance
query_range:
results_cache:
cache:
embedded_cache:
enabled: true
max_size_mb: 100
schema_config:
configs:
- from: 2020-10-24
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: index_
period: 24h
pattern_ingester:
enabled: true
metric_aggregation:
loki_address: localhost:3100
frontend:
encoding: protobuf
limits_config:
metric_aggregation_enabled: true
reject_old_samples: true
reject_old_samples_max_age: 168h # 7 days
ingestion_rate_mb: 15 # Allow 15 MiB/s per tenant (default was 4)
ingestion_burst_size_mb: 30 # Allow short bursts up to 30 MiB (default was 6)
# Optional: maximum number of active log streams per tenant (default is 10000)
# max_global_streams_per_user: 10000
# Optional: maximum size of a single log line (default 256 kB)
# max_line_size: 262144
# --- Optional: compactor (cleans up old data) ---
# compactor:
# working_directory: /loki/compactor
# shared_store: filesystem
# compaction_interval: 10m
# retention_enabled: true
# retention_delete_delay: 2h
# retention_delete_worker_count: 150
# --- Optional: ruler (for alerts based on logs) ---
# ruler:
# alertmanager_url: http://alertmanager:9093 # URL of your Alertmanager
# storage:
# type: local
# local:
# directory: /loki/rules
# rule_path: /tmp/loki/rules-temp
# ring:
# kvstore:
# store: inmemory
# enable_api: true
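A quick readiness check against this single-instance Loki; a sketch, assuming it runs from a container on the same network where the service is reachable as loki (the name used by the push URLs elsewhere in this commit):

curl -s http://loki:3100/ready           # returns "ready" once startup has finished
curl -s http://loki:3100/metrics | head  # internal Prometheus metrics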

View File

@ -0,0 +1,57 @@
global:
scrape_interval: 15s # How often targets are scraped
evaluation_interval: 15s # How often rules are evaluated
scrape_configs:
- job_name: 'prometheus'
# Prometheus monitors itself
static_configs:
- targets: ['localhost:9090']
- job_name: 'node-exporter'
# Docker Swarm service discovery for the node exporter
dockerswarm_sd_configs:
- host: unix:///var/run/docker.sock
role: tasks
port: 9100 # Default node exporter port
relabel_configs:
# Only keep tasks in the 'running' state
- source_labels: [__meta_dockerswarm_task_desired_state]
regex: running
action: keep
# Only select tasks of the 'node-exporter' service from this stack
# Adjust the regex if your stack is named differently (assumption here: the stack name contains 'monitoring')
- source_labels: [__meta_dockerswarm_service_name]
regex: ^monitoring_node-exporter$ # Adjust the regex to your stack name!
action: keep
# Use the Swarm node's hostname as the instance label
- source_labels: [__meta_dockerswarm_node_hostname]
target_label: instance
# Set the target address to the correct IP:port
- source_labels: [__address__]
regex: '(.*):.*' # Extract the IP address
replacement: '${1}:9100' # Set the correct port (9100)
target_label: __address__
- job_name: 'cadvisor'
dockerswarm_sd_configs:
- host: unix:///var/run/docker.sock
role: tasks
port: 8080 # Default cAdvisor port
relabel_configs:
# Only keep tasks in the 'running' state
- source_labels: [__meta_dockerswarm_task_desired_state]
regex: running
action: keep
# Only select tasks of the 'cadvisor' service from this stack
# Adjust the regex to your stack name!
- source_labels: [__meta_dockerswarm_service_name]
regex: .*(monitoring|mon)_cadvisor.* # Adjust the regex to your stack name!
action: keep
# Use the Swarm node's hostname as the instance label
- source_labels: [__meta_dockerswarm_node_hostname]
target_label: instance
# IMPORTANT: set the metrics path, since cAdvisor serves it under /metrics
- action: replace
target_label: __metrics_path__
replacement: /metrics
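Two ways to sanity-check this scrape configuration; a sketch, assuming promtool is available and the file is stored at the path used by the Swarm stack further down in this commit:

promtool check config /srv/monitoring/config/prometheus.v3.yml  # validate the configuration
curl -s http://prometheus:9090/api/v1/targets                   # list discovered targets (from inside the overlay network)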

View File

@ -0,0 +1,38 @@
server:
http_listen_port: 9080
grpc_listen_port: 0
positions:
filename: /mnt/promtail/positions.yaml # Path inside the mounted volume
clients:
- url: http://loki:3100/loki/api/v1/push # Sends logs to the Loki service
scrape_configs:
- job_name: docker_containers
docker_sd_configs:
- host: unix:///var/run/docker.sock
refresh_interval: 5s
relabel_configs:
# Extract the container name (without the leading '/')
- source_labels: ['__meta_docker_container_name']
regex: '/(.*)'
target_label: 'container_name'
# Keep the log stream (stdout/stderr) as a label
- source_labels: ['__meta_docker_container_log_stream']
target_label: 'logstream'
# Extract the service name from the Swarm label
- source_labels: ['__meta_docker_container_label_com_docker_swarm_service_name']
target_label: 'service_name'
# Extract the task name from the Swarm label
- source_labels: ['__meta_docker_container_label_com_docker_swarm_task_name']
target_label: 'task_name'
# Add an 'instance' label with the task's hostname (an approximation of the node name)
- action: replace
source_labels: ['container_name'] # Needs an existing label as the source
target_label: 'instance'
replacement: ${HOSTNAME} # Uses the Swarm HOSTNAME variable
# Drop logs from Promtail itself (adjust the regex to your stack name if needed)
- source_labels: ['container_name']
regex: 'monitoring_promtail.*' # Adjust 'monitoring' to your stack name!
action: drop

View File

@ -0,0 +1,36 @@
server:
http_listen_port: 3200 # Default API/UI port
distributor:
receivers: # Enable the OTLP receiver (Tempo can also receive directly)
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
# Basic data-processing configuration (usually fine to start with)
ingester:
trace_idle_period: 10s
max_block_bytes: 1048576 # 1MB
max_block_duration: 5m
compactor:
compaction:
block_retention: 1h # Minimum time to keep blocks (low value for testing)
# IMPORTANT: define storage explicitly!
storage:
trace:
backend: local # Backend type: local filesystem
# Write Ahead Log (WAL) configuration.
wal:
path: /tmp/tempo/wal # Directory to store the WAL locally.
# Local configuration for filesystem storage.
local:
path: /tmp/tempo/blocks # Directory to store the TSDB blocks.
# Pool used for finding trace IDs.
pool:
max_workers: 100 # Worker pool determines the number of parallel requests to the object store backend.
queue_depth: 10000 # Maximum depth for the querier queue jobs. A job is required for each block searched.
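Tempo's readiness can be verified on the API/UI port configured above; a sketch, run from a container on the same network where the service is reachable as tempo:

curl -s http://tempo:3200/ready           # readiness of the Tempo instance
curl -s http://tempo:3200/metrics | head  # internal metrics on the same port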

View File

@ -0,0 +1,226 @@
configs:
alloy-config-v3:
file: /srv/monitoring/config/alloy.v3.alloy
loki-config-v1:
file: /srv/monitoring/config/loki.v1.yml
prometheus-config-v3:
file: /srv/monitoring/config/prometheus.v3.yml
tempo-config-v1:
file: /srv/monitoring/config/tempo.v1.yml
volumes:
prometheus-data:
driver: local
grafana-data:
driver: local
loki-data:
driver: local
alloy-data:
driver: local
tempo-data:
driver: local
networks:
monitoring-net: # Internal overlay network for the monitoring components
driver: overlay
attachable: true # Allows other containers/stacks to attach if needed
traefik_public: # The external network that Traefik listens on
external: true # Important: this network is NOT created by this stack
services:
prometheus:
image: prom/prometheus:latest
user: "65534:988"
volumes:
- prometheus-data:/prometheus
- /var/run/docker.sock:/var/run/docker.sock:ro
configs:
- source: prometheus-config-v3 # Versioned config
target: /etc/prometheus/prometheus.yml
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
- '--web.console.templates=/usr/share/prometheus/consoles'
- '--web.enable-lifecycle'
- '--web.enable-remote-write-receiver'
networks:
- monitoring-net
- traefik_public # Only if Traefik should reach Prometheus directly (optional)
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager # Optional: pin to manager nodes
labels:
- "traefik.enable=true"
# --- Router for the Prometheus UI ---
- "traefik.http.routers.prometheus.rule=Host(`prometheus.genius.ceo`)"
- "traefik.http.routers.prometheus.entrypoints=https" # Adjust the entrypoint if yours differs
- "traefik.http.routers.prometheus.tls.certresolver=main" # Adjust the cert resolver!
# --- Service for the Prometheus UI ---
- "traefik.http.services.prometheus.loadbalancer.server.port=9090"
# --- Middleware (optional, e.g. for authentication) ---
# - "traefik.http.routers.prometheus.middlewares=my-auth-middleware"
# --- Network for Traefik ---
# IMPORTANT: the network must exist and Traefik must listen on it.
- "traefik.swarm.network=traefik_public" # Adjust the Traefik network name!
loki:
image: grafana/loki:latest
volumes:
- loki-data:/loki
configs:
- source: loki-config-v1
target: /etc/loki/local-config.yaml
command: "-config.file=/etc/loki/local-config.yaml"
networks:
- monitoring-net
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager
tempo:
image: grafana/tempo:latest # Current Tempo image
volumes:
- tempo-data:/tmp/tempo # Persistent storage for traces (default path)
configs:
- source: tempo-config-v1
target: /etc/tempo/tempo.yaml
command: [ "-config.file=/etc/tempo/tempo.yaml" ]
user: root
# Tempo listens internally on several ports for different protocols:
# - 4317 (OTLP gRPC - used by Alloy)
# - 4318 (OTLP HTTP)
# - 14268 (Jaeger gRPC)
# - 3200 (Tempo HTTP frontend/API - for Grafana & the UI)
# We do not expose them externally for now.
networks:
- monitoring-net
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager # Optional: pin to manager nodes
grafana:
image: grafana/grafana:latest
volumes:
- grafana-data:/var/lib/grafana
environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=admin # Better handled via Docker secrets!
# Further Grafana env vars as needed
networks:
- monitoring-net
- traefik_public # Only if Traefik should reach Grafana directly (optional)
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager # Optional: pin to manager nodes
labels:
- "traefik.enable=true"
# --- Router for Grafana ---
- "traefik.http.routers.grafana.rule=Host(`grafana.genius.ceo`)"
- "traefik.http.routers.grafana.entrypoints=https" # Adjust the entrypoint if yours differs
- "traefik.http.routers.grafana.tls.certresolver=main" # Adjust the cert resolver!
# --- Service for Grafana ---
- "traefik.http.services.grafana.loadbalancer.server.port=3000"
# --- Middleware (optional) ---
# - "traefik.http.routers.grafana.middlewares=my-auth-middleware"
# --- Network for Traefik ---
- "traefik.swarm.network=traefik_public" # Adjust the Traefik network name!
alloy:
image: grafana/alloy:latest # Official Alloy image
volumes:
- alloy-data:/var/lib/alloy/data # Persistent storage for Alloy (WAL etc.)
- /var/run/docker.sock:/var/run/docker.sock:ro # For Docker discovery
configs:
- source: alloy-config-v3
target: /etc/alloy/config.alloy # Target path for the Alloy config
environment:
- HOSTNAME=${HOSTNAME}
# Start as root because of Docker socket / volume permissions; can be optimized later (socket proxy)
# user: root
command: [
"run",
"--server.http.listen-addr=0.0.0.0:12345",
"/etc/alloy/config.alloy",
]
networks:
- monitoring-net
- traefik_public
deploy:
mode: global # IMPORTANT: Alloy must run on every node!
labels: # Traefik labels for the Alloy UI
- "traefik.enable=true"
# --- Router for the Alloy UI ---
- "traefik.http.routers.alloy-ui.rule=Host(`otlp.genius.ceo`)"
- "traefik.http.routers.alloy-ui.entrypoints=https"
- "traefik.http.routers.alloy-ui.tls.certresolver=main"
- "traefik.http.routers.alloy-ui.service=alloy-ui@swarm"
# --- Service for the Alloy UI ---
- "traefik.http.services.alloy-ui.loadbalancer.server.port=12345" # Target port is 12345 (Alloy UI default)
# # --- Router for OTLP HTTP ---
# - "traefik.http.routers.otlp-http.rule=Host(`alloy.genius.ceo`)"
# - "traefik.http.routers.otlp-http.entrypoints=https"
# - "traefik.http.routers.otlp-http.tls.certresolver=main"
# - "traefik.http.routers.otlp-http.service=otlp-http@swarm"
# # --- Service for OTLP HTTP ---
# - "traefik.http.services.otlp-http.loadbalancer.server.port=4318" # Target port is 4318 (OTLP HTTP default)
# --- Router for the FARO RECEIVER ---
- "traefik.http.routers.faro-receiver.rule=Host(`alloy.genius.ceo`)"
- "traefik.http.routers.faro-receiver.entrypoints=https"
- "traefik.http.routers.faro-receiver.tls.certresolver=main"
- "traefik.http.routers.faro-receiver.service=faro-receiver@swarm"
# --- Service for the FARO RECEIVER ---
- "traefik.http.services.faro-receiver.loadbalancer.server.port=12347" # Target port is 12347 (Faro receiver default)
# # --- Middlewares ---
# - "traefik.http.routers.otlp-http.middlewares=alloy-ratelimit@swarm"
# - "traefik.http.middlewares.alloy-ratelimit.ratelimit.average=100" # z.B. 100 Anfragen pro Sekunde
# - "traefik.http.middlewares.alloy-ratelimit.ratelimit.burst=50" # kurzfristig 50 mehr erlaubt
# --- Netzwerk für Traefik ---
- "traefik.swarm.network=traefik_public" # Traefik Netzwerkname prüfen/anpassen!
node-exporter:
image: quay.io/prometheus/node-exporter:latest # Use a current image
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
- '--path.rootfs=/rootfs'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)'
networks:
- monitoring-net # Only the internal network is needed
deploy:
mode: global # Runs on EVERY node in the Swarm
cadvisor:
image: gcr.io/cadvisor/cadvisor:latest # Google's cAdvisor Image
volumes:
# cAdvisor needs access to host system info and Docker
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /var/run/docker.sock:/var/run/docker.sock:ro
networks:
- monitoring-net # Only the internal network is needed
deploy:
mode: global # Runs on EVERY node in the Swarm
resources: # Optional: limits resources, cAdvisor can be hungry
limits:
memory: 512M
reservations:
memory: 256M
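Deploying the whole monitoring stack could then look like this; a sketch, assuming the stack file is saved as monitoring.yml (hypothetical name), the referenced config files exist under /srv/monitoring/config/ on the manager where the command runs, and the stack is named monitoring so that the Prometheus relabel regexes above match:

ls /srv/monitoring/config/        # alloy.v3.alloy, loki.v1.yml, prometheus.v3.yml, tempo.v1.yml
docker stack deploy -c monitoring.yml monitoring
docker stack services monitoring  # check that all services reach their desired replica count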

View File

@ -0,0 +1,102 @@
receivers:
hostmetrics:
collection_interval: 30s
root_path: /hostfs
scrapers:
cpu: {}
load: {}
memory: {}
disk: {}
filesystem: {}
network: {}
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
prometheus:
config:
global:
scrape_interval: 60s
scrape_configs:
- job_name: otel-agent
static_configs:
- targets:
- localhost:8888
labels:
job_name: otel-agent
tcplog/docker:
listen_address: "0.0.0.0:2255"
operators:
- type: regex_parser
regex: '^<([0-9]+)>[0-9]+ (?P<timestamp>[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+)?([zZ]|([\+-])([01]\d|2[0-3]):?([0-5]\d)?)?) (?P<container_id>\S+) (?P<container_name>\S+) [0-9]+ - -( (?P<body>.*))?'
timestamp:
parse_from: attributes.timestamp
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
- type: move
from: attributes["body"]
to: body
- type: remove
field: attributes.timestamp
# please remove names from below if you want to collect logs from them
- type: filter
id: signoz_logs_filter
expr: 'attributes.container_name matches "^(signoz_(logspout|signoz|otel-collector|clickhouse|zookeeper))|(infra_(logspout|otel-agent|otel-metrics)).*"'
processors:
batch:
send_batch_size: 10000
send_batch_max_size: 11000
timeout: 10s
resourcedetection:
# Using OTEL_RESOURCE_ATTRIBUTES envvar, env detector adds custom labels.
detectors:
# - ec2
# - gcp
# - azure
- env
- system
timeout: 2s
extensions:
health_check:
endpoint: 0.0.0.0:13133
pprof:
endpoint: 0.0.0.0:1777
exporters:
otlp:
endpoint: ${env:SIGNOZ_COLLECTOR_ENDPOINT}
tls:
insecure: true
headers:
signoz-access-token: ${env:SIGNOZ_ACCESS_TOKEN}
# debug: {}
service:
telemetry:
logs:
encoding: json
metrics:
address: 0.0.0.0:8888
extensions:
- health_check
- pprof
pipelines:
traces:
receivers: [otlp]
processors: [resourcedetection, batch]
exporters: [otlp]
metrics:
receivers: [otlp]
processors: [resourcedetection, batch]
exporters: [otlp]
metrics/hostmetrics:
receivers: [hostmetrics]
processors: [resourcedetection, batch]
exporters: [otlp]
metrics/prometheus:
receivers: [prometheus]
processors: [resourcedetection, batch]
exporters: [otlp]
logs:
receivers: [otlp, tcplog/docker]
processors: [resourcedetection, batch]
exporters: [otlp]

View File

@ -0,0 +1,103 @@
receivers:
prometheus:
config:
global:
scrape_interval: 60s
scrape_configs:
- job_name: otel-metrics
static_configs:
- targets:
- localhost:8888
labels:
job_name: otel-metrics
# For Docker daemon metrics to be scraped, it must be configured to expose
# Prometheus metrics, as documented here: https://docs.docker.com/config/daemon/prometheus/
# - job_name: docker-daemon
# dockerswarm_sd_configs:
# - host: unix:///var/run/docker.sock
# role: nodes
# relabel_configs:
# - source_labels: [__meta_dockerswarm_node_address]
# target_label: __address__
# replacement: $1:9323
- job_name: "dockerswarm"
dockerswarm_sd_configs:
- host: unix:///var/run/docker.sock
role: tasks
relabel_configs:
- action: keep
regex: running
source_labels:
- __meta_dockerswarm_task_desired_state
- action: keep
regex: true
source_labels:
- __meta_dockerswarm_service_label_signoz_io_scrape
- regex: ([^:]+)(?::\d+)?
replacement: $1
source_labels:
- __address__
target_label: swarm_container_ip
- separator: .
source_labels:
- __meta_dockerswarm_service_name
- __meta_dockerswarm_task_slot
- __meta_dockerswarm_task_id
target_label: swarm_container_name
- target_label: __address__
source_labels:
- swarm_container_ip
- __meta_dockerswarm_service_label_signoz_io_port
separator: ":"
- source_labels:
- __meta_dockerswarm_service_label_signoz_io_path
target_label: __metrics_path__
- source_labels:
- __meta_dockerswarm_service_label_com_docker_stack_namespace
target_label: namespace
- source_labels:
- __meta_dockerswarm_service_name
target_label: service_name
- source_labels:
- __meta_dockerswarm_task_id
target_label: service_instance_id
- source_labels:
- __meta_dockerswarm_node_hostname
target_label: host_name
processors:
batch:
send_batch_size: 10000
send_batch_max_size: 11000
timeout: 10s
resourcedetection:
detectors:
- env
- system
timeout: 2s
extensions:
health_check:
endpoint: 0.0.0.0:13133
pprof:
endpoint: 0.0.0.0:1777
exporters:
otlp:
endpoint: ${env:SIGNOZ_COLLECTOR_ENDPOINT}
tls:
insecure: true
headers:
signoz-access-token: ${env:SIGNOZ_ACCESS_TOKEN}
# debug: {}
service:
telemetry:
logs:
encoding: json
metrics:
address: 0.0.0.0:8888
extensions:
- health_check
- pprof
pipelines:
metrics:
receivers: [prometheus]
processors: [resourcedetection, batch]
exporters: [otlp]

View File

@ -0,0 +1,78 @@
version: "3"
x-common: &common
networks:
- signoz-net
extra_hosts:
- host.docker.internal:host-gateway
logging:
options:
max-size: 50m
max-file: "3"
deploy:
mode: global
restart_policy:
condition: on-failure
services:
otel-agent:
<<: *common
image: otel/opentelemetry-collector-contrib:0.111.0
command:
- --config=/etc/otel-collector-config.yaml
configs:
- source: otel-agent-config-v1
target: /etc/otel-collector-config.yaml
volumes:
- /:/hostfs:ro
environment:
- SIGNOZ_COLLECTOR_ENDPOINT=http://host.docker.internal:4317 # In case of external SigNoz or cloud, update the endpoint and access token
- OTEL_RESOURCE_ATTRIBUTES=host.name={{.Node.Hostname}},os.type={{.Node.Platform.OS}}
# - SIGNOZ_ACCESS_TOKEN="<your-access-token>"
# Before exposing the ports, make sure the ports are not used by other services
# ports:
# - "4317:4317"
# - "4318:4318"
otel-metrics:
<<: *common
image: otel/opentelemetry-collector-contrib:0.111.0
user: 0:0 # If you have security concerns, you can replace this with your `UID:GID` that has necessary permissions to docker.sock
command:
- --config=/etc/otel-collector-config.yaml
configs:
- source: otel-metrics-config-v1
target: /etc/otel-collector-config.yaml
volumes:
- /var/run/docker.sock:/var/run/docker.sock
environment:
- SIGNOZ_COLLECTOR_ENDPOINT=http://host.docker.internal:4317 # In case of external SigNoz or cloud, update the endpoint and access token
- OTEL_RESOURCE_ATTRIBUTES=host.name={{.Node.Hostname}},os.type={{.Node.Platform.OS}}
# - SIGNOZ_ACCESS_TOKEN="<your-access-token>"
# Before exposing the ports, make sure the ports are not used by other services
# ports:
# - "4317:4317"
# - "4318:4318"
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager
logspout:
<<: *common
image: "gliderlabs/logspout:v3.2.14"
command: syslog+tcp://otel-agent:2255
user: root
volumes:
- /etc/hostname:/etc/host_hostname:ro
- /var/run/docker.sock:/var/run/docker.sock
depends_on:
- otel-agent
networks:
signoz-net:
name: signoz-net
external: true
configs:
otel-metrics-config-v1:
file: /mnt/cephfs/signoz-infra/config/otel-metrics-config.v1.yaml
otel-agent-config-v1:
file: /mnt/cephfs/signoz-infra/config/otel-agent-config.v1.yaml
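Because signoz-net is marked external here, the SigNoz stack later in this commit (which creates that network) has to be deployed first; a sketch, assuming this file is saved as signoz-infra.yml (hypothetical name) and the stack is named infra so that the otel-agent filter expression above matches:

docker network inspect signoz-net              # created by the signoz stack, must exist first
docker stack deploy -c signoz-infra.yml infra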

View File

@ -0,0 +1,75 @@
<?xml version="1.0"?>
<clickhouse>
<!-- ZooKeeper is used to store metadata about replicas, when using Replicated tables.
Optional. If you don't use replicated tables, you could omit that.
See https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/replication/
-->
<zookeeper>
<node index="1">
<host>zookeeper-1</host>
<port>2181</port>
</node>
<!-- <node index="2">
<host>zookeeper-2</host>
<port>2181</port>
</node>
<node index="3">
<host>zookeeper-3</host>
<port>2181</port>
</node> -->
</zookeeper>
<!-- Configuration of clusters that could be used in Distributed tables.
https://clickhouse.com/docs/en/operations/table_engines/distributed/
-->
<remote_servers>
<cluster>
<!-- Inter-server per-cluster secret for Distributed queries
default: no secret (no authentication will be performed)
If set, then Distributed queries will be validated on shards, so at least:
- such cluster should exist on the shard,
- such cluster should have the same secret.
And also (and which is more important), the initial_user will
be used as current user for the query.
Right now the protocol is pretty simple and it only takes into account:
- cluster name
- query
Also it will be nice if the following will be implemented:
- source hostname (see interserver_http_host), but then it will depend on DNS;
it can use the IP address instead, but then you need to get it right on the initiator node.
- target hostname / ip address (same notes as for source hostname)
- time-based security tokens
-->
<!-- <secret></secret> -->
<shard>
<!-- Optional. Whether to write data to just one of the replicas. Default: false (write data to all replicas). -->
<!-- <internal_replication>false</internal_replication> -->
<!-- Optional. Shard weight when writing data. Default: 1. -->
<!-- <weight>1</weight> -->
<replica>
<host>clickhouse</host>
<port>9000</port>
<!-- Optional. Priority of the replica for load_balancing. Default: 1 (less value has more priority). -->
<!-- <priority>1</priority> -->
</replica>
</shard>
<!-- <shard>
<replica>
<host>clickhouse-2</host>
<port>9000</port>
</replica>
</shard>
<shard>
<replica>
<host>clickhouse-3</host>
<port>9000</port>
</replica>
</shard> -->
</cluster>
</remote_servers>
</clickhouse>

File diff suppressed because it is too large

View File

@ -0,0 +1,21 @@
<functions>
<function>
<type>executable</type>
<name>histogramQuantile</name>
<return_type>Float64</return_type>
<argument>
<type>Array(Float64)</type>
<name>buckets</name>
</argument>
<argument>
<type>Array(Float64)</type>
<name>counts</name>
</argument>
<argument>
<type>Float64</type>
<name>quantile</name>
</argument>
<format>CSV</format>
<command>./histogramQuantile</command>
</function>
</functions>

View File

@ -0,0 +1,41 @@
<?xml version="1.0"?>
<clickhouse>
<storage_configuration>
<disks>
<default>
<keep_free_space_bytes>10485760</keep_free_space_bytes>
</default>
<s3>
<type>s3</type>
<!-- For S3 cold storage,
if region is us-east-1, endpoint can be https://<bucket-name>.s3.amazonaws.com
if region is not us-east-1, endpoint should be https://<bucket-name>.s3-<region>.amazonaws.com
For GCS cold storage,
endpoint should be https://storage.googleapis.com/<bucket-name>/data/
-->
<endpoint>https://BUCKET-NAME.s3-REGION-NAME.amazonaws.com/data/</endpoint>
<access_key_id>ACCESS-KEY-ID</access_key_id>
<secret_access_key>SECRET-ACCESS-KEY</secret_access_key>
<!-- In case of S3, uncomment the below configuration in case you want to read
AWS credentials from the Environment variables if they exist. -->
<!-- <use_environment_credentials>true</use_environment_credentials> -->
<!-- In case of GCS, uncomment the below configuration, since GCS does
not support batch deletion and result in error messages in logs. -->
<!-- <support_batch_delete>false</support_batch_delete> -->
</s3>
</disks>
<policies>
<tiered>
<volumes>
<default>
<disk>default</disk>
</default>
<s3>
<disk>s3</disk>
<perform_ttl_move_on_insert>0</perform_ttl_move_on_insert>
</s3>
</volumes>
</tiered>
</policies>
</storage_configuration>
</clickhouse>

View File

@ -0,0 +1,123 @@
<?xml version="1.0"?>
<clickhouse>
<!-- See also the files in users.d directory where the settings can be overridden. -->
<!-- Profiles of settings. -->
<profiles>
<!-- Default settings. -->
<default>
<!-- Maximum memory usage for processing single query, in bytes. -->
<max_memory_usage>10000000000</max_memory_usage>
<!-- How to choose between replicas during distributed query processing.
random - choose random replica from set of replicas with minimum number of errors
nearest_hostname - from set of replicas with minimum number of errors, choose replica
with minimum number of different symbols between replica's hostname and local hostname
(Hamming distance).
in_order - first live replica is chosen in specified order.
first_or_random - if first replica one has higher number of errors, pick a random one from replicas with minimum number of errors.
-->
<load_balancing>random</load_balancing>
</default>
<!-- Profile that allows only read queries. -->
<readonly>
<readonly>1</readonly>
</readonly>
</profiles>
<!-- Users and ACL. -->
<users>
<!-- If user name was not specified, 'default' user is used. -->
<default>
<!-- See also the files in users.d directory where the password can be overridden.
Password could be specified in plaintext or in SHA256 (in hex format).
If you want to specify password in plaintext (not recommended), place it in 'password' element.
Example: <password>qwerty</password>.
Password could be empty.
If you want to specify SHA256, place it in 'password_sha256_hex' element.
Example: <password_sha256_hex>65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5</password_sha256_hex>
Restrictions of SHA256: impossibility to connect to ClickHouse using MySQL JS client (as of July 2019).
If you want to specify double SHA1, place it in 'password_double_sha1_hex' element.
Example: <password_double_sha1_hex>e395796d6546b1b65db9d665cd43f0e858dd4303</password_double_sha1_hex>
If you want to specify a previously defined LDAP server (see 'ldap_servers' in the main config) for authentication,
place its name in 'server' element inside 'ldap' element.
Example: <ldap><server>my_ldap_server</server></ldap>
If you want to authenticate the user via Kerberos (assuming Kerberos is enabled, see 'kerberos' in the main config),
place 'kerberos' element instead of 'password' (and similar) elements.
The name part of the canonical principal name of the initiator must match the user name for authentication to succeed.
You can also place 'realm' element inside 'kerberos' element to further restrict authentication to only those requests
whose initiator's realm matches it.
Example: <kerberos />
Example: <kerberos><realm>EXAMPLE.COM</realm></kerberos>
How to generate decent password:
Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-'
In first line will be password and in second - corresponding SHA256.
How to generate double SHA1:
Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-'
In first line will be password and in second - corresponding double SHA1.
-->
<password></password>
<!-- List of networks with open access.
To open access from everywhere, specify:
<ip>::/0</ip>
To open access only from localhost, specify:
<ip>::1</ip>
<ip>127.0.0.1</ip>
Each element of list has one of the following forms:
<ip> IP-address or network mask. Examples: 213.180.204.3 or 10.0.0.1/8 or 10.0.0.1/255.255.255.0
2a02:6b8::3 or 2a02:6b8::3/64 or 2a02:6b8::3/ffff:ffff:ffff:ffff::.
<host> Hostname. Example: server01.clickhouse.com.
To check access, DNS query is performed, and all received addresses compared to peer address.
<host_regexp> Regular expression for host names. Example, ^server\d\d-\d\d-\d\.clickhouse\.com$
To check access, DNS PTR query is performed for peer address and then regexp is applied.
Then, for result of PTR query, another DNS query is performed and all received addresses compared to peer address.
It is strongly recommended that the regexp ends with $
All results of DNS requests are cached till server restart.
-->
<networks>
<ip>::/0</ip>
</networks>
<!-- Settings profile for user. -->
<profile>default</profile>
<!-- Quota for user. -->
<quota>default</quota>
<!-- User can create other users and grant rights to them. -->
<!-- <access_management>1</access_management> -->
</default>
</users>
<!-- Quotas. -->
<quotas>
<!-- Name of quota. -->
<default>
<!-- Limits for time interval. You could specify many intervals with different limits. -->
<interval>
<!-- Length of interval. -->
<duration>3600</duration>
<!-- No limits. Just calculate resource usage for time interval. -->
<queries>0</queries>
<errors>0</errors>
<result_rows>0</result_rows>
<read_rows>0</read_rows>
<execution_time>0</execution_time>
</interval>
</default>
</quotas>
</clickhouse>

View File

@ -0,0 +1,140 @@
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
cors:
allowed_origins:
- https://*.genius.ceo
- https://*.avicenna.hamburg
prometheus:
config:
global:
scrape_interval: 60s
scrape_configs:
- job_name: otel-collector
static_configs:
- targets:
- localhost:8888
labels:
job_name: otel-collector
docker_stats:
endpoint: unix:///var/run/docker.sock
metrics:
container.cpu.utilization:
enabled: true
container.memory.percent:
enabled: true
container.network.io.usage.rx_bytes:
enabled: true
container.network.io.usage.tx_bytes:
enabled: true
container.network.io.usage.rx_dropped:
enabled: true
container.network.io.usage.tx_dropped:
enabled: true
container.memory.usage.limit:
enabled: true
container.memory.usage.total:
enabled: true
container.blockio.io_service_bytes_recursive:
enabled: true
processors:
batch:
send_batch_size: 10000
send_batch_max_size: 11000
timeout: 10s
resourcedetection:
# Using OTEL_RESOURCE_ATTRIBUTES envvar, env detector adds custom labels.
detectors: [env, system]
timeout: 2s
resourcedetection/docker:
detectors: [env, docker]
timeout: 2s
override: false
signozspanmetrics/delta:
metrics_exporter: clickhousemetricswrite, signozclickhousemetrics
metrics_flush_interval: 60s
latency_histogram_buckets: [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s ]
dimensions_cache_size: 100000
aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA
enable_exp_histogram: true
dimensions:
- name: service.namespace
default: default
- name: deployment.environment
default: default
# This is added to ensure the uniqueness of the timeseries
# Otherwise, identical timeseries produced by multiple replicas of
# collectors result in incorrect APM metrics
- name: signoz.collector.id
- name: service.version
- name: browser.platform
- name: browser.mobile
- name: k8s.cluster.name
- name: k8s.node.name
- name: k8s.namespace.name
- name: host.name
- name: host.type
- name: container.name
extensions:
health_check:
endpoint: 0.0.0.0:13133
pprof:
endpoint: 0.0.0.0:1777
exporters:
clickhousetraces:
datasource: tcp://clickhouse:9000/signoz_traces
low_cardinal_exception_grouping: ${env:LOW_CARDINAL_EXCEPTION_GROUPING}
use_new_schema: true
clickhousemetricswrite:
endpoint: tcp://clickhouse:9000/signoz_metrics
resource_to_telemetry_conversion:
enabled: true
disable_v2: true
clickhousemetricswrite/prometheus:
endpoint: tcp://clickhouse:9000/signoz_metrics
disable_v2: true
signozclickhousemetrics:
dsn: tcp://clickhouse:9000/signoz_metrics
clickhouselogsexporter:
dsn: tcp://clickhouse:9000/signoz_logs
timeout: 10s
use_new_schema: true
otlp:
endpoint: http://otel-collector:4317
tls:
insecure: true
# debug: {}
service:
telemetry:
logs:
encoding: json
metrics:
address: 0.0.0.0:8888
extensions:
- health_check
- pprof
pipelines:
traces:
receivers: [otlp]
processors: [signozspanmetrics/delta, batch]
exporters: [clickhousetraces]
metrics/docker:
receivers: [docker_stats]
processors: [resourcedetection/docker]
exporters: [otlp]
metrics:
receivers: [otlp]
processors: [batch]
exporters: [clickhousemetricswrite, signozclickhousemetrics]
metrics/prometheus:
receivers: [prometheus]
processors: [batch]
exporters: [clickhousemetricswrite/prometheus, signozclickhousemetrics]
logs:
receivers: [otlp]
processors: [batch]
exporters: [clickhouselogsexporter]

View File

@ -0,0 +1 @@
server_endpoint: ws://signoz:4320/v1/opamp

View File

@ -0,0 +1,25 @@
# my global config
global:
scrape_interval: 5s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files: []
# - "first_rules.yml"
# - "second_rules.yml"
# - 'alerts.yml'
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs: []
remote_read:
- url: tcp://clickhouse:9000/signoz_metrics

View File

@ -0,0 +1,243 @@
version: '3'
x-common: &common
networks:
- signoz-net
deploy:
restart_policy:
condition: on-failure
logging:
options:
max-size: 50m
max-file: '3'
x-clickhouse-defaults: &clickhouse-defaults
!!merge <<: *common
image: clickhouse/clickhouse-server:24.1.2-alpine
tty: true
user: "1000:1000"
deploy:
placement:
constraints: [node.hostname == manager-node-3]
labels:
signoz.io/scrape: 'true'
signoz.io/port: '9363'
signoz.io/path: '/metrics'
depends_on:
- init-clickhouse
- zookeeper-1
healthcheck:
test:
- CMD
- wget
- --spider
- -q
- 0.0.0.0:8123/ping
interval: 30s
timeout: 5s
retries: 3
ulimits:
nproc: 65535
nofile:
soft: 262144
hard: 262144
x-zookeeper-defaults: &zookeeper-defaults
!!merge <<: *common
image: bitnami/zookeeper:3.7.1
user: root
deploy:
placement:
constraints: [node.hostname == manager-node-1]
labels:
signoz.io/scrape: 'true'
signoz.io/port: '9141'
signoz.io/path: '/metrics'
healthcheck:
test:
- CMD-SHELL
- curl -s -m 2 http://localhost:8080/commands/ruok | grep error | grep null
interval: 30s
timeout: 5s
retries: 3
x-db-depend: &db-depend
!!merge <<: *common
depends_on:
- clickhouse
- schema-migrator
services:
init-clickhouse:
!!merge <<: *common
image: clickhouse/clickhouse-server:24.1.2-alpine
command:
- bash
- -c
- |
version="v0.0.1"
node_os=$$(uname -s | tr '[:upper:]' '[:lower:]')
node_arch=$$(uname -m | sed s/aarch64/arm64/ | sed s/x86_64/amd64/)
echo "Fetching histogram-binary for $${node_os}/$${node_arch}"
cd /tmp
wget -O histogram-quantile.tar.gz "https://github.com/SigNoz/signoz/releases/download/histogram-quantile%2F$${version}/histogram-quantile_$${node_os}_$${node_arch}.tar.gz"
tar -xvzf histogram-quantile.tar.gz
mv histogram-quantile /var/lib/clickhouse/user_scripts/histogramQuantile
deploy:
restart_policy:
condition: on-failure
volumes:
- /mnt/cephfs/signoz/data/clickhouse/user_scripts:/var/lib/clickhouse/user_scripts/
zookeeper-1:
!!merge <<: *zookeeper-defaults
# ports:
# - "2181:2181"
# - "2888:2888"
# - "3888:3888"
volumes:
- /mnt/cephfs/signoz/data/zookeeper-1:/bitnami/zookeeper
environment:
- ZOO_SERVER_ID=1
- ALLOW_ANONYMOUS_LOGIN=yes
- ZOO_AUTOPURGE_INTERVAL=1
- ZOO_ENABLE_PROMETHEUS_METRICS=yes
- ZOO_PROMETHEUS_METRICS_PORT_NUMBER=9141
clickhouse:
!!merge <<: *clickhouse-defaults
# TODO: needed for the clickhouse TCP connection
hostname: clickhouse
# ports:
# - "9000:9000"
# - "8123:8123"
# - "9181:9181"
configs:
- source: clickhouse-config-v1
target: /etc/clickhouse-server/config.xml
- source: clickhouse-users-v1
target: /etc/clickhouse-server/users.xml
- source: clickhouse-custom-function-v1
target: /etc/clickhouse-server/custom-function.xml
- source: clickhouse-cluster-v1
target: /etc/clickhouse-server/config.d/cluster.xml
volumes:
- /mnt/cephfs/signoz/data/clickhouse/data/user_scripts:/var/lib/clickhouse/user_scripts/
- /mnt/cephfs/signoz/data/clickhouse/data:/var/lib/clickhouse/
# - ../common/clickhouse/storage.xml:/etc/clickhouse-server/config.d/storage.xml
signoz:
!!merge <<: *db-depend
image: signoz/signoz:v0.86.1
command:
- --config=/root/config/prometheus.yml
# ports:
# - "8080:8080" # signoz port
# - "6060:6060" # pprof port
configs:
- source: signoz-prometheus-config-v1
target: /root/config/prometheus.yml
volumes:
- /mnt/cephfs/signoz/data/dashboards:/root/config/dashboards
- /mnt/cephfs/signoz/data/sqlite:/var/lib/signoz/
environment:
- SIGNOZ_ALERTMANAGER_PROVIDER=signoz
- SIGNOZ_TELEMETRYSTORE_CLICKHOUSE_DSN=tcp://clickhouse:9000
- SIGNOZ_SQLSTORE_SQLITE_PATH=/var/lib/signoz/signoz.db
- DASHBOARDS_PATH=/root/config/dashboards
- STORAGE=clickhouse
- GODEBUG=netdns=go
- TELEMETRY_ENABLED=true
- DEPLOYMENT_TYPE=docker-swarm
healthcheck:
test:
- CMD
- wget
- --spider
- -q
- localhost:8080/api/v1/health
interval: 30s
timeout: 5s
retries: 3
networks:
- signoz-net
- traefik_public
deploy:
labels:
- 'traefik.enable=true'
# --- Router for the Signoz UI ---
- 'traefik.http.routers.signoz.rule=Host(`signoz.genius.ceo`)'
- 'traefik.http.routers.signoz.entrypoints=https'
- 'traefik.http.routers.signoz.tls.certresolver=main'
# --- Service for the Signoz UI ---
- 'traefik.http.services.signoz.loadbalancer.server.port=8080'
# --- Network for Traefik ---
- 'traefik.swarm.network=traefik_public'
otel-collector:
!!merge <<: *db-depend
image: signoz/signoz-otel-collector:v0.111.42
user: root
command:
- --config=/etc/otel-collector-config.yaml
- --manager-config=/etc/manager-config.yaml
- --copy-path=/var/tmp/collector-config.yaml
- --feature-gates=-pkg.translator.prometheus.NormalizeName
configs:
- source: otel-collector-config-v4
target: /etc/otel-collector-config.yaml
- source: otel-collector-manager-config-v1
target: /etc/manager-config.yaml
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
environment:
- OTEL_RESOURCE_ATTRIBUTES=host.name={{.Node.Hostname}},os.type={{.Node.Platform.OS}}
- LOW_CARDINAL_EXCEPTION_GROUPING=false
ports:
# - "1777:1777" # pprof extension
- '4317:4317' # OTLP gRPC receiver
- '4318:4318' # OTLP HTTP receiver
deploy:
replicas: 3
labels:
- 'traefik.enable=true'
# --- Router for the Signoz Collector UI ---
- 'traefik.http.routers.signoz-collector.rule=Host(`collector.genius.ceo`)'
- 'traefik.http.routers.signoz-collector.entrypoints=https'
- 'traefik.http.routers.signoz-collector.tls.certresolver=main'
# --- Service for the Signoz Collector UI ---
- 'traefik.http.services.signoz-collector.loadbalancer.server.port=4318'
# --- Network for Traefik ---
- 'traefik.swarm.network=traefik_public'
depends_on:
- clickhouse
- schema-migrator
- signoz
networks:
- signoz-net
- traefik_public
schema-migrator:
!!merge <<: *common
image: signoz/signoz-schema-migrator:v0.111.42
deploy:
restart_policy:
condition: on-failure
delay: 5s
entrypoint: sh
command:
- -c
- '/signoz-schema-migrator sync --dsn=tcp://clickhouse:9000 --up= && /signoz-schema-migrator async --dsn=tcp://clickhouse:9000 --up='
depends_on:
- clickhouse
networks:
signoz-net:
name: signoz-net
attachable: true
traefik_public:
external: true
configs:
otel-collector-config-v4:
file: /mnt/cephfs/signoz/config/otel-collector-config.v4.yaml
otel-collector-manager-config-v1:
file: /mnt/cephfs/signoz/config/signoz/otel-collector-opamp-config.yaml
clickhouse-config-v1:
file: /mnt/cephfs/signoz/config/clickhouse/config.v1.xml
clickhouse-users-v1:
file: /mnt/cephfs/signoz/config/clickhouse/users.v1.xml
clickhouse-custom-function-v1:
file: /mnt/cephfs/signoz/config/clickhouse/custom-function.v1.xml
clickhouse-cluster-v1:
file: /mnt/cephfs/signoz/config/clickhouse/cluster.v1.xml
signoz-prometheus-config-v1:
file: /mnt/cephfs/signoz/config/signoz/prometheus.v1.yml
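A deployment sketch for this stack, assuming the file is saved as signoz.yml (hypothetical name) on a manager, the config files exist under /mnt/cephfs/signoz/config/ as referenced above, and the stack is named signoz (matching the signoz_* container-name filter used by the infra agent):

docker stack deploy -c signoz.yml signoz
docker stack ps signoz --no-trunc  # watch clickhouse, schema-migrator and signoz come up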

View File

@ -0,0 +1,33 @@
---
# - name: AUTHENTIK | Create directories and set permissions
# ansible.builtin.file:
# path: "/mnt/cephfs/authentik/data/{{ item }}"
# state: directory
# owner: 1000
# group: 1000
# mode: '0755'
# loop:
# - cache
# - certs
# - db
# - media
# - templates
# run_once: true
# delegate_to: "{{ groups['managers'][0] }}"
- name: AUTHENTIK | Generate Compose file
ansible.builtin.template:
src: docker-compose.yml.j2
dest: /mnt/cephfs/authentik/authentik.yml
mode: 0644
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: AUTHENTIK | Deploy app stack
community.docker.docker_stack:
state: present
name: authentik
compose:
- /mnt/cephfs/authentik/authentik.yml
delegate_to: "{{ groups['managers'][0] }}"
run_once: true

View File

@ -0,0 +1,100 @@
---
networks:
traefik_public:
external: true
internal:
services:
postgresql:
image: docker.io/library/postgres:16-alpine
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "pg_isready -d $POSTGRES_DB -U $POSTGRES_USER"]
start_period: 20s
interval: 30s
retries: 5
timeout: 5s
volumes:
- /mnt/cephfs/authentik/data/db:/var/lib/postgresql/data
environment:
POSTGRES_PASSWORD: "{{ pg_pass }}"
POSTGRES_USER: "{{ pg_user | default('authentik') }}"
POSTGRES_DB: "{{ pg_db | default('authentik') }}"
networks:
- internal
redis:
image: docker.io/library/redis:alpine
command: --save 60 1 --loglevel warning
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "redis-cli ping | grep PONG"]
start_period: 20s
interval: 30s
retries: 5
timeout: 3s
volumes:
- /mnt/cephfs/authentik/data/cache:/data
networks:
- internal
server:
image: "{{ authentik_image | default('ghcr.io/goauthentik/server') }}:{{ authentik_tag | default('2025.6.3') }}"
restart: unless-stopped
command: server
environment:
AUTHENTIK_SECRET_KEY: "{{ authentik_secret_key }}"
AUTHENTIK_REDIS__HOST: redis
AUTHENTIK_POSTGRESQL__HOST: postgresql
AUTHENTIK_POSTGRESQL__USER: "{{ pg_user | default('authentik') }}"
AUTHENTIK_POSTGRESQL__NAME: "{{ pg_db | default('authentik') }}"
AUTHENTIK_POSTGRESQL__PASSWORD: "{{ pg_pass }}"
AUTHENTIK_ERROR_REPORTING__ENABLED: "false"
volumes:
- /mnt/cephfs/authentik/data/media:/media
- /mnt/cephfs/authentik/data/templates:/templates
networks:
- traefik_public
- internal
deploy:
labels:
traefik.enable: "true"
traefik.swarm.network: {{ traefik_net }}
traefik.http.routers.authentik.rule: Host(`{{ traefik_route }}`) || HostRegexp(`{subdomain:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?}.genius.ceo`) && PathPrefix(`/outpost.goauthentik.io/`)
traefik.http.routers.authentik.entrypoints: https
traefik.http.routers.authentik.tls: "true"
traefik.http.routers.authentik.tls.certresolver: main
traefik.http.services.authentik.loadbalancer.server.port: 9000
# - "traefik.enable=true"
# - "traefik.swarm.network={{ traefik_net }}"
# - "traefik.http.routers.authentik.rule=Host(`{{ traefik_route }}`) || HostRegexp(`{subdomain:[A-Za-z0-9](?:[A-Za-z0-9\-]{0,61}[A-Za-z0-9])?}.genius.ceo`) && PathPrefix(`/outpost.goauthentik.io/`)"
# - "traefik.http.routers.authentik.entrypoints=https"
# - "traefik.http.routers.authentik.tls=true"
# - "traefik.http.routers.authentik.tls.certresolver=main"
# - "traefik.http.services.authentik.loadbalancer.server.port=9000"
worker:
image: "{{ authentik_image | default('ghcr.io/goauthentik/server') }}:{{ authentik_tag | default('2025.6.3') }}"
restart: unless-stopped
command: worker
environment:
AUTHENTIK_SECRET_KEY: "{{ authentik_secret_key }}"
AUTHENTIK_REDIS__HOST: redis
AUTHENTIK_POSTGRESQL__HOST: postgresql
AUTHENTIK_POSTGRESQL__USER: "{{ pg_user | default('authentik') }}"
AUTHENTIK_POSTGRESQL__NAME: "{{ pg_db | default('authentik') }}"
AUTHENTIK_POSTGRESQL__PASSWORD: "{{ pg_pass }}"
# `user: root` and the docker socket volume are optional.
# See more for the docker socket integration here:
# https://goauthentik.io/docs/outposts/integrations/docker
# Removing `user: root` also prevents the worker from fixing the permissions
# on the mounted folders, so when removing this make sure the folders have the correct UID/GID
# (1000:1000 by default)
user: root
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- /mnt/cephfs/authentik/data/media:/media
- /mnt/cephfs/authentik/data/certs:/certs
- /mnt/cephfs/authentik/data/templates:/templates
networks:
- internal

View File

@ -0,0 +1,11 @@
---
authentik_image: "ghcr.io/goauthentik/server"
authentik_tag: "2025.6.3"
authentik_secret_key: ""
pg_user: "authentik"
pg_pass: ""
pg_db: "authentik"
traefik_net: "traefik_public"
traefik_route: "auth.genius.ceo"
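Both secrets above default to empty strings and have to be overridden elsewhere (for example in a vault-encrypted vars file), otherwise the stack will not come up with usable credentials. A minimal sketch with placeholder values only:
# Placeholder values — real values would live in an encrypted vars file, not here:
authentik_secret_key: "<50+ random characters>"
pg_pass: "<database password>"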

View File

@ -0,0 +1,93 @@
---
- name: CEPH | Determine the private IP of the first manager
ansible.builtin.set_fact:
ceph_bootstrap_ip: "{{ hostvars[inventory_hostname]['ansible_' + private_interface]['ipv4']['address'] }}"
when: inventory_hostname == groups['managers'][0]
- name: CEPH | Bootstrap the cluster on the first manager
ansible.builtin.command:
cmd: "cephadm bootstrap --mon-ip {{ ceph_bootstrap_ip }}"
creates: /etc/ceph/ceph.conf
when: inventory_hostname == groups['managers'][0]
- name: CEPH | Fetch the cephadm public SSH key
ansible.builtin.command: "cephadm shell -- ceph cephadm get-pub-key"
register: cephadm_pub_key
changed_when: false
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
- name: CEPH | Distribute the cephadm public key to root on all nodes
ansible.posix.authorized_key:
user: root
key: "{{ hostvars[groups['managers'][0]]['cephadm_pub_key'].stdout }}"
state: present
key_options: 'no-port-forwarding,no-X11-forwarding,no-agent-forwarding,no-pty'
- name: CEPH | Add the remaining nodes to the Ceph cluster
ansible.builtin.command:
cmd: "ceph orch host add {{ item }} {{ hostvars[item]['ansible_' + private_interface]['ipv4']['address'] }}"
loop: "{{ groups['all'] }}"
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
- name: CEPH | Check whether OSDs (storage devices) already exist
ansible.builtin.command: "ceph osd ls"
register: existing_osds
changed_when: false
failed_when: false
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
- name: CEPH | Add the dedicated disk ({{ ceph_osd_device }}) on each node as an OSD
ansible.builtin.command: "ceph orch daemon add osd {{ item }}:{{ ceph_osd_device }}"
loop: "{{ groups['all'] }}"
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
when: existing_osds.stdout | length == 0
- name: CEPH | Check whether CephFS already exists
ansible.builtin.command: "ceph fs ls -f json"
register: cephfs_list
changed_when: false
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
- name: CEPH | Create CephFS pools and filesystem if they do not exist
block:
- name: Create the metadata pool for CephFS
ansible.builtin.command: "ceph osd pool create {{ cephfs_name }}_metadata"
- name: Create the data pool for CephFS
ansible.builtin.command: "ceph osd pool create {{ cephfs_name }}_data"
- name: Create the CephFS filesystem
ansible.builtin.command: "ceph fs new {{ cephfs_name }} {{ cephfs_name }}_metadata {{ cephfs_name }}_data"
when: cephfs_list.stdout | from_json | length == 0
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
- name: CEPH | Start metadata servers (MDS) for CephFS
ansible.builtin.command: "ceph orch apply mds {{ cephfs_name }} --placement=2"
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
when: cephfs_list.stdout | from_json | length == 0
- name: CEPH | Fetch the Ceph admin key for mounting
ansible.builtin.command: "ceph auth get-key client.admin"
register: ceph_admin_key
changed_when: false
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
- name: CEPH | Create the mount point for CephFS
ansible.builtin.file:
path: /mnt/cephfs
state: directory
mode: '0755'
- name: CEPH | Mount CephFS on all nodes (and add it to /etc/fstab)
ansible.posix.mount:
path: /mnt/cephfs
src: "{{ hostvars[groups['managers'][0]]['ceph_bootstrap_ip'] }}:/"
fstype: ceph
opts: "name=admin,secret={{ ceph_admin_key.stdout }}"
state: mounted
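Since later roles rely on the CephFS mount, a verification step could be appended here. A minimal sketch (not part of the original role) that uses the same ceph CLI the role already calls and simply waits for a healthy cluster:

- name: CEPH | Wait until the cluster reports HEALTH_OK (optional verification sketch)
  ansible.builtin.command: "ceph health"
  register: ceph_health
  changed_when: false
  retries: 20
  delay: 15
  until: "'HEALTH_OK' in ceph_health.stdout"
  delegate_to: "{{ groups['managers'][0] }}"
  run_once: true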

View File

@ -0,0 +1,108 @@
---
- name: COMMON | Update and upgrade system packages
ansible.builtin.apt:
update_cache: true
upgrade: dist
autoremove: true
autoclean: true
- name: COMMON | Install required packages
ansible.builtin.apt:
name:
- ufw
- fail2ban
- unattended-upgrades
- apt-listchanges
- docker-ce
- python3-pip
- chrony
- lvm2
- cephadm
- ceph-common
state: present
- name: COMMON | Start and enable the chrony service
ansible.builtin.service:
name: chronyd
state: started
enabled: true
- name: COMMON | Start and enable the Docker service
ansible.builtin.service:
name: docker
state: started
enabled: true
- name: COMMON | Create a dedicated admin user
ansible.builtin.user:
name: "{{ admin_user }}"
password: "{{ admin_password}}"
shell: /bin/bash
groups: sudo,docker
append: true
state: present
- name: COMMON | Set up SSH keys for the admin user
ansible.posix.authorized_key:
user: "{{ admin_user }}"
key: "{{ item }}"
state: present
with_items: "{{ authorized_keys }}"
- name: COMMON | Create the cephadm user
ansible.builtin.user:
name: "cephadm"
password: "{{ cephadm_password }}"
shell: /bin/bash
groups: sudo,docker
append: true
state: present
- name: COMMON | Create the .ssh directory for the cephadm user
ansible.builtin.file:
path: /home/cephadm/.ssh
state: directory
owner: cephadm
group: cephadm
mode: '0700'
- name: COMMON | Allow passwordless sudo for the cephadm user
ansible.builtin.copy:
dest: "/etc/sudoers.d/91-cephadm-nopasswd"
content: "cephadm ALL=(ALL) NOPASSWD: ALL"
mode: '0440'
validate: 'visudo -cf %s'
- name: COMMON | Generate an ed25519 SSH key for the cephadm user (first manager only)
community.crypto.openssh_keypair:
path: /home/cephadm/.ssh/id_ed25519
type: ed25519
owner: cephadm
group: cephadm
mode: '0600'
when: inventory_hostname == groups['managers'][0]
- name: COMMON | Read the cephadm public SSH key (first manager only)
ansible.builtin.slurp:
src: /home/cephadm/.ssh/id_ed25519.pub
register: cephadm_ssh_pub_key
when: inventory_hostname == groups['managers'][0]
- name: COMMON | Distribute the cephadm public SSH key to all nodes
ansible.posix.authorized_key:
user: cephadm
key: "{{ hostvars[groups['managers'][0]]['cephadm_ssh_pub_key']['content'] | b64decode }}"
state: present
- name: COMMON | Configure automatic security updates
ansible.builtin.copy:
src: assets/50unattended-upgrades
dest: /etc/apt/apt.conf.d/50unattended-upgrades
owner: root
group: root
mode: '0644'
- name: COMMON | Enable periodic auto-updates
ansible.builtin.copy:
src: assets/20auto-upgrades
dest: /etc/apt/apt.conf.d/20auto-upgrades
owner: root
group: root
mode: '0644'
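A quick, optional sanity check for the unattended-upgrades configuration copied above could look like the following sketch (not part of the original role; uses the standard dry-run flag of the unattended-upgrade CLI):

- name: COMMON | Validate the unattended-upgrades configuration (optional sketch)
  ansible.builtin.command: "unattended-upgrade --dry-run"
  changed_when: false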

View File

@ -0,0 +1,58 @@
---
- name: SWARM | Ensure Docker SDK for Python is installed
ansible.builtin.apt:
name: python3-docker
state: present
- name: SWARM | Get interface IP address for the manager
ansible.builtin.set_fact:
manager_ip: "{{ hostvars[inventory_hostname]['ansible_' + private_interface]['ipv4']['address'] }}"
when: inventory_hostname == groups['managers'][0]
- name: SWARM | Initialize the Docker Swarm
community.docker.docker_swarm:
state: present
advertise_addr: "{{ manager_ip }}"
when: inventory_hostname == groups['managers'][0]
register: swarm_init_result
- name: SWARM | Get the join tokens
community.docker.docker_swarm_info:
register: swarm_info
when: inventory_hostname == groups['managers'][0]
- name: SWARM | Verify that join tokens were fetched
ansible.builtin.assert:
that:
- swarm_info is defined
- swarm_info.swarm_facts is defined
- swarm_info.swarm_facts.JoinTokens.Manager is defined
- swarm_info.swarm_facts.JoinTokens.Worker is defined
fail_msg: "Could not fetch the join tokens from the Swarm manager. Is the Swarm initialized correctly?"
success_msg: "Join tokens fetched successfully."
when: inventory_hostname == groups['managers'][0]
- name: SWARM | Join manager nodes to the Swarm
community.docker.docker_swarm:
state: join
remote_addrs: [ "{{ hostvars[groups['managers'][0]]['manager_ip'] }}:2377" ]
join_token: "{{ hostvars[groups['managers'][0]]['swarm_info']['swarm_facts']['JoinTokens']['Manager'] }}"
when: inventory_hostname in groups['managers']
- name: SWARM | Join worker nodes to the Swarm
community.docker.docker_swarm:
state: join
remote_addrs: [ "{{ hostvars[groups['managers'][0]]['manager_ip'] }}:2377" ]
join_token: "{{ hostvars[groups['managers'][0]]['swarm_info']['swarm_facts']['JoinTokens']['Worker'] }}"
when: inventory_hostname in groups['workers']
- name: SWARM | Verify Swarm Cluster State (run on manager)
ansible.builtin.command: docker node ls
register: swarm_nodes
changed_when: false
when: inventory_hostname == groups['managers'][0]
- name: SWARM | Display cluster state
ansible.builtin.debug:
msg: "{{ swarm_nodes.stdout_lines }}"
when: inventory_hostname == groups['managers'][0]
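An optional assertion could follow to make sure every inventory host actually joined. A sketch (not part of the original role) that assumes `docker node ls` prints one header line plus one line per node:

- name: SWARM | Assert that every inventory host joined the swarm (optional sketch)
  ansible.builtin.assert:
    that:
      - (swarm_nodes.stdout_lines | length) - 1 == (groups['all'] | length)
    fail_msg: "docker node ls does not list every inventory host yet."
  when: inventory_hostname == groups['managers'][0]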

View File

@ -0,0 +1,3 @@
---
dockge_stacks_dir: /mnt/cephfs/dockge/stacks
dockge_data_dir: /mnt/cephfs/dockge/data

View File

@ -0,0 +1,42 @@
---
- name: DOCKGE | Copy Stack Files
ansible.builtin.copy:
directory_mode: true
src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/dockge
dest: /mnt/cephfs
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
# - name: DOCKGE | Ensure the application data directory exists
# ansible.builtin.file:
# path: "{{ dockge_data_dir }}"
# state: directory
# owner: root
# group: root
# mode: '0755'
# become: true
# - name: DOCKGE | Ensure the stacks directory exists
# ansible.builtin.file:
# path: "{{ dockge_stacks_dir }}"
# state: directory
# owner: root
# group: root
# mode: '0755'
# become: true
# - name: DOCKGE | Deploy the stack from the template file
# community.docker.docker_stack:
# state: present
# name: dockge
# compose:
# - "{{ lookup('template', '../../../resources/dockge/dockge.yml') }}"
# delegate_to: "{{ groups['managers'][0] }}"
# run_once: true
- name: DOCKGE | Deploy app stack
community.docker.docker_stack:
state: present
name: dockge
compose:
- /mnt/cephfs/dockge/dockge.yml
delegate_to: "{{ groups['managers'][0] }}"
run_once: true

View File

@ -0,0 +1,9 @@
---
- name: FAIL2BAN | Create a local jail configuration
ansible.builtin.template:
src: jail.local.j2
dest: /etc/fail2ban/jail.local
owner: root
group: root
mode: '0644'
notify: restart fail2ban

View File

@ -0,0 +1,8 @@
[DEFAULT]
bantime = 1h
findtime = 10m
maxretry = 5
[sshd]
enabled = true
port = {{ ssh_port }}
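To confirm fail2ban actually picked up the templated SSH port, a check could be added to the role's tasks. A sketch (not part of the original role) using the standard fail2ban-client CLI:

- name: FAIL2BAN | Show sshd jail status (optional sketch)
  ansible.builtin.command: "fail2ban-client status sshd"
  register: f2b_sshd_status
  changed_when: false
- name: FAIL2BAN | Print jail status (optional sketch)
  ansible.builtin.debug:
    msg: "{{ f2b_sshd_status.stdout_lines }}"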

View File

@ -0,0 +1,14 @@
postgres_version: 16-alpine
gitea_version: "1.21"
gitea_domain: "{{ subdomain }}.{{ main_domain }}"
gitea_http_port: 3000
gitea_ssh_port: 2222
data_dir: "{{ ceph_volume }}/gitea"
subdomain: git
gitea_db_type: "postgres"
gitea_db_host: db:5432
gitea_db_name: "gitea"
gitea_db_user: "gitea"
gitea_db_password: ""

View File

@ -0,0 +1,38 @@
- name: GITEA | Ensure data directories
ansible.builtin.file:
path: '{{ data_dir }}/data'
state: directory
owner: 1000
group: 1000
mode: '0750'
recurse: true
delegate_to: "{{ groups['managers'][0] }}"
- name: GITEA | Ensure DB data directories
ansible.builtin.file:
path: "{{ data_dir }}/data/db"
state: directory
# The Postgres Alpine image runs as UID 70 (postgres).
# Debian-based images would use 999 instead.
owner: 70
group: 70
mode: '0700'
recurse: true
delegate_to: "{{ groups['managers'][0] }}"
- name: GITEA | Generate Compose file
ansible.builtin.template:
src: docker-compose.yml.j2
dest: '{{ data_dir }}/gitea.yml'
mode: '0644'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: GITEA | Deploy stack
community.docker.docker_stack:
state: present
name: gitea
compose:
- '{{ data_dir }}/gitea.yml'
delegate_to: "{{ groups['managers'][0] }}"
run_once: true

View File

@ -0,0 +1,62 @@
networks:
{{ traefik_public_net }}:
external: true
internal:
services:
server:
image: gitea/gitea:{{ gitea_version }}
environment:
- USER_UID=1000
- USER_GID=1000
- GITEA__database__DB_TYPE={{ gitea_db_type }}
- GITEA__database__HOST={{ gitea_db_host }}
- GITEA__database__NAME={{ gitea_db_name }}
- GITEA__database__USER={{ gitea_db_user }}
- GITEA__database__PASSWD={{ gitea_db_password }}
- GITEA__server__DOMAIN={{ gitea_domain }}
- GITEA__server__SSH_DOMAIN={{ gitea_domain }}
- GITEA__server__SSH_PORT={{ gitea_ssh_port }}
- GITEA__server__ROOT_URL=https://{{ gitea_domain }}/
volumes:
- {{ data_dir }}/data:/data
- /etc/timezone:/etc/timezone:ro
- /etc/localtime:/etc/localtime:ro
networks:
- internal
- {{ traefik_public_net }}
ports:
- "{{ gitea_ssh_port }}:22"
deploy:
mode: replicated
replicas: 1
labels:
- "traefik.enable=true"
- "traefik.docker.network={{ traefik_public_net }}"
- "traefik.http.routers.gitea.rule=Host(`{{ gitea_domain }}`)"
- "traefik.http.routers.gitea.entrypoints=https"
- "traefik.http.routers.gitea.tls.certresolver=main"
- "traefik.http.services.gitea.loadbalancer.server.port=3000"
db:
image: postgres:{{ postgres_version }}
restart: always
environment:
- POSTGRES_USER={{ gitea_db_user }}
- POSTGRES_PASSWORD={{ gitea_db_password }}
- POSTGRES_DB={{ gitea_db_name }}
networks:
- internal
volumes:
- {{ data_dir }}/data/db:/var/lib/postgresql/data
command:
- "postgres"
- "-c"
- "fsync=on"
- "-c"
- "full_page_writes=on"
- "-c"
- "synchronous_commit=on"
deploy:
mode: replicated
replicas: 1

View File

@ -0,0 +1,47 @@
---
- name: KESTRA | Ensure data directory
ansible.builtin.file:
path: '{{ data_dir }}/data/data'
state: directory
mode: '0755'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: KESTRA | Ensure db directory
ansible.builtin.file:
path: '{{ data_dir }}/data/db'
state: directory
mode: '0755'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: KESTRA | Create the tmpfiles.d configuration file
ansible.builtin.copy:
content: "d /tmp/kestra-wd 0755 root root -"
dest: /etc/tmpfiles.d/kestra-wd.conf
owner: root
group: root
mode: '0644'
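# The tmpfiles.d "d" entry above makes systemd recreate /tmp/kestra-wd after a reboot,
# so the bind mount used by the Kestra service keeps working.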
- name: KESTRA | Create Kestra working directory
ansible.builtin.file:
path: /tmp/kestra-wd
state: directory
mode: '0755'
- name: KESTRA | Generate Compose file
ansible.builtin.template:
src: docker-compose.yml.j2
dest: '{{ data_dir }}/kestra.yml'
mode: '0644'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: KESTRA | Deploy stack
community.docker.docker_stack:
state: present
name: kestra
compose:
- '{{ data_dir }}/kestra.yml'
delegate_to: "{{ groups['managers'][0] }}"
run_once: true

View File

@ -0,0 +1,92 @@
networks:
internal:
{{ traefik_public_net }}:
external: true
services:
postgres:
image: postgres:17
volumes:
- {{ data_dir }}/data/db:/var/lib/postgresql/data
environment:
POSTGRES_DB: {{ kestra.db.name }}
POSTGRES_USER: {{ kestra.db.user }}
POSTGRES_PASSWORD: "{{ kestra.db.pass }}"
healthcheck:
test: ["CMD-SHELL", "pg_isready -d '$${POSTGRES_DB}' -U $${POSTGRES_USER}"]
interval: 30s
timeout: 10s
retries: 10
networks:
- internal
deploy:
mode: replicated
replicas: 1
kestra:
image: kestra/kestra:v0.24.2
entrypoint: /bin/bash
# Note that this is meant for development only. Refer to the documentation for production deployments of Kestra which runs without a root user.
user: "root"
command:
- -c
- /app/kestra server standalone --worker-thread=128
volumes:
- {{ data_dir }}/data/data:/app/storage
- /var/run/docker.sock:/var/run/docker.sock
- /tmp/kestra-wd:/tmp/kestra-wd
environment:
KESTRA_CONFIGURATION: |
datasources:
postgres:
url: jdbc:postgresql://postgres:5432/kestra
driverClassName: org.postgresql.Driver
username: {{ kestra.db.user }}
password: {{ kestra.db.pass }}
kestra:
tutorialFlows:
enabled: false
traces:
root: DEFAULT
micronaut:
metrics:
export:
otlp:
enabled: true
url: http://signoz_otel-collector:4318/v1/metrics
otel:
traces:
exporter: otlp
exporter:
otlp:
endpoint: http://signoz_otel-collector:4318
server:
basic-auth:
username: {{ kestra.basic_auth.user }}
password: {{ kestra.basic_auth.pass }}
repository:
type: postgres
storage:
type: local
local:
base-path: "/app/storage"
queue:
type: postgres
tasks:
tmp-dir:
path: /tmp/kestra-wd/tmp
url: http://localhost:8080/
networks:
- {{ traefik_public_net }}
- internal
deploy:
mode: replicated
replicas: 1
labels:
- "traefik.enable=true"
- "traefik.swarm.network={{ traefik_public_net }}"
- "traefik.http.routers.kestra.rule=Host(`{{ subdomain }}.{{ main_domain }}`)"
- "traefik.http.routers.kestra.entrypoints=https"
- "traefik.http.routers.kestra.tls=true"
- "traefik.http.routers.kestra.tls.certresolver=main"
- "traefik.http.services.kestra.loadbalancer.server.port=8080"

View File

@ -0,0 +1,11 @@
subdomain: kestra
data_dir: "{{ ceph_volume }}/kestra"
kestra:
basic_auth:
user: "ma@coachhamburg.com"
pass: "igyozi9B87yTeiQ6z2sbe8Y4aQLJV58jdaCNu"
db:
name: kestra
user: kestra
pass: ""

View File

@ -0,0 +1,13 @@
---
- name: Copy Stack Files
ansible.builtin.copy:
directory_mode: true
src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/monitoring
dest: /srv
- block:
- name: Deploy Monitoring stack
community.docker.docker_stack:
state: present
name: monitoring
compose:
- /srv/monitoring/observability.yml

View File

@ -0,0 +1,25 @@
---
- name: PORTAINER | Ensure data directories
ansible.builtin.file:
path: '{{ data_dir }}/data'
state: directory
mode: '0755'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: PORTAINER | Generate Compose file
ansible.builtin.template:
src: docker-compose.yml.j2
dest: '{{ data_dir }}/portainer.yml'
mode: '0644'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: PORTAINER | Deploy stack
community.docker.docker_stack:
state: present
name: portainer
compose:
- '{{ data_dir }}/portainer.yml'
delegate_to: "{{ groups['managers'][0] }}"
run_once: true

View File

@ -0,0 +1,37 @@
version: '3.2'
services:
agent:
image: portainer/agent:{{ portainer_version }}
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- /var/lib/docker/volumes:/var/lib/docker/volumes
networks:
- {{ traefik_public_net }}
deploy:
mode: global
placement:
constraints: [node.platform.os == linux]
portainer:
image: portainer/portainer-ce:{{ portainer_version }}
command: -H tcp://portainer_agent:9001 --tlsskipverify
volumes:
- {{ data_dir }}/data:/data
networks:
- {{ traefik_public_net }}
deploy:
mode: replicated
replicas: 1
labels:
- "traefik.enable=true"
- "traefik.swarm.network={{ traefik_public_net }}"
- "traefik.http.routers.portainer.rule=Host(`{{ subdomain }}.{{ main_domain }}`)"
- "traefik.http.routers.portainer.entrypoints=https"
- "traefik.http.routers.portainer.tls=true"
- "traefik.http.routers.portainer.tls.certresolver=main"
- "traefik.http.services.portainer.loadbalancer.server.port=9000"
networks:
{{ traefik_public_net }}:
external: true

View File

@ -0,0 +1,4 @@
subdomain: port
data_dir: "{{ ceph_volume }}/portainer"
portainer_version: 2.33.5

View File

@ -0,0 +1,18 @@
---
- name: Copy Stack Files
ansible.builtin.copy:
directory_mode: true
src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/signoz-infra
dest: /mnt/cephfs
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
- name: Deploy Signoz Infra stack
community.docker.docker_stack:
state: present
name: signoz-infra
prune: true
compose:
- /mnt/cephfs/signoz-infra/signoz-infra.yml
delegate_to: "{{ groups['managers'][0] }}"
run_once: true

View File

@ -0,0 +1,18 @@
---
- name: Copy Stack Files
ansible.builtin.copy:
directory_mode: true
src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/signoz
dest: /mnt/cephfs
delegate_to: "{{ groups['managers'][0] }}"
run_once: true
- name: Deploy Signoz stack
community.docker.docker_stack:
state: present
name: signoz
prune: true
compose:
- /mnt/cephfs/signoz/signoz.yml
delegate_to: "{{ groups['managers'][0] }}"
run_once: true

View File

@ -0,0 +1,30 @@
---
- name: SSH | Ensure privilege separation directory exists
ansible.builtin.file:
path: /run/sshd
state: directory
mode: '0755'
- name: SSH | Allow root login with keys only
ansible.builtin.lineinfile:
path: /etc/ssh/sshd_config
regexp: '^#?PermitRootLogin'
line: 'PermitRootLogin prohibit-password'
validate: 'sshd -t -f %s'
notify: restart sshd
- name: SSH | Disable password authentication
ansible.builtin.lineinfile:
path: /etc/ssh/sshd_config
regexp: '^#?PasswordAuthentication'
line: 'PasswordAuthentication no'
validate: 'sshd -t -f %s'
notify: restart sshd
- name: SSH | Forbid empty passwords
ansible.builtin.lineinfile:
path: /etc/ssh/sshd_config
regexp: '^#?PermitEmptyPasswords'
line: 'PermitEmptyPasswords no'
validate: 'sshd -t -f %s'
notify: restart sshd

View File

@ -0,0 +1,18 @@
[http]
[http.middlewares]
[http.middlewares.authentik.forwardAuth]
address = "http://authentik_server:9000/outpost.goauthentik.io/auth/traefik"
trustForwardHeader = true
authResponseHeaders = [
"X-authentik-username",
"X-authentik-groups",
"X-authentik-email",
"X-authentik-name",
"X-authentik-uid",
"X-authentik-jwt",
"X-authentik-meta-jwks",
"X-authentik-meta-outpost",
"X-authentik-meta-provider",
"X-authentik-meta-app",
"X-authentik-meta-version"
]
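Any service routed through Traefik can opt into this forwardAuth middleware by referencing it as `authentik@file` in its router labels, exactly as the Traefik dashboard router in this repository does. A sketch with a hypothetical service name and hostname:

services:
  myapp:
    # ... image, networks, volumes of the hypothetical service ...
    deploy:
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.myapp.rule=Host(`app.genius.ceo`)"
        - "traefik.http.routers.myapp.entrypoints=https"
        - "traefik.http.routers.myapp.tls.certresolver=main"
        - "traefik.http.routers.myapp.middlewares=authentik@file"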

View File

@ -0,0 +1,80 @@
[global]
checkNewVersion = true
sendAnonymousUsage = false
[experimental]
otlpLogs = true
[core]
defaultRuleSyntax = "v2"
[accessLog]
filePath = "/logs/access.log"
format = "json"
# Enable the Dashboard
[api]
dashboard = true
# Write out Traefik logs
[log]
level = "INFO"
format = "json"
filePath = "/logs/traefik.log"
# [log.otlp.http]
# endpoint = "http://signoz_otel-collector:4318/v1/logs"
[entryPoints]
[entryPoints.http]
address = ":80"
[entryPoints.http.http.redirections.entryPoint]
to = "https"
scheme = "https"
[entryPoints.https]
address = ":443"
# [entryPoints.https.http.tls]
# certResolver = "main"
# OTel
# [tracing]
# serviceName = "traefik"
# [tracing.otlp.http]
# endpoint = "http://signoz_otel-collector:4318/v1/traces"
# [tracing.otlp.http.tls]
# insecureSkipVerify = true
# # Metrics
# [metrics]
# addInternals = false
# [metrics.otlp]
# serviceName = "traefik"
# addEntryPointsLabels = true
# addRoutersLabels = true
# addServicesLabels = true
# [metrics.otlp.http]
# endpoint = "http://signoz_otel-collector:4318/v1/metrics"
# [metrics.otlp.grpc]
# endpoint = "monitoring_alloy:4317"
# insecure = true
# Let's Encrypt
[certificatesResolvers.main.acme]
email = "ma@coachhamburg.com"
storage = "acme.json"
# uncomment to use staging CA for testing
# caServer = "https://acme-staging-v02.api.letsencrypt.org/directory"
# [certificatesResolvers.main.acme.tlsChallenge]
[certificatesResolvers.main.acme.dnsChallenge]
provider = "digitalocean"
# Uncomment to use HTTP validation, like a caveman!
# [certificatesResolvers.main.acme.httpChallenge]
# entryPoint = "http"
[providers]
[providers.swarm]
endpoint = "unix:///var/run/docker.sock"
exposedByDefault = false
[providers.file]
directory = "/etc/traefik/dynamic"
watch = true

View File

@ -0,0 +1,44 @@
---
- name: TRAEFIK | Copy Stack Files
ansible.builtin.copy:
directory_mode: true
src: traefik
dest: "{{ ceph_volume }}"
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: TRAEFIK | Generate Compose file
ansible.builtin.template:
src: docker-compose.yml.j2
dest: "{{ data_dir }}/traefik.yml"
mode: '0644'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: TRAEFIK | Ensure acme.json exists and has restrictive permissions
ansible.builtin.file:
path: "{{ data_dir }}/data/acme.json"
state: touch
access_time: preserve
modification_time: preserve
mode: '0600'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: TRAEFIK | Create the traefik_public network
community.docker.docker_network:
name: traefik_public
driver: overlay
state: present
attachable: true
ipam_config:
- subnet: '172.16.200.0/24'
gateway: '172.16.200.1'
run_once: true
delegate_to: "{{ groups['managers'][0] }}"
- name: TRAEFIK | Deploy app stack
community.docker.docker_stack:
state: present
name: traefik
compose:
- "{{ data_dir }}/traefik.yml"
delegate_to: "{{ groups['managers'][0] }}"
run_once: true

View File

@ -0,0 +1,55 @@
services:
app:
image: traefik:{{ traefik_version }}
ports:
- target: 80
published: 80
protocol: tcp
mode: host
- target: 443
published: 443
protocol: tcp
mode: host
- target: 8080
published: 8080
protocol: tcp
environment:
# - HETZNER_API_TOKEN={{ hetzner_api_key }}
- DO_AUTH_TOKEN={{ do_api_key }}
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
- {{ data_dir }}/config:/etc/traefik
- {{ data_dir }}/data/logs:/logs
- {{ data_dir }}/data/acme.json:/acme.json
# healthcheck:
# test: ["CMD", "traefik", "healthcheck", "--ping"]
# timeout: 1s
# interval: 10s
# retries: 3
# start_period: 10s
networks:
- {{ traefik_public_net }}
# Global mode makes an instance of traefik listen on _every_ node, so that regardless of which
# node the request arrives on, it'll be forwarded to the correct backend service.
deploy:
mode: global
labels:
- "traefik.enable=true"
- "traefik.swarm.network={{ traefik_public_net }}"
- "traefik.http.routers.api.rule=Host(`{{ subdomain }}.{{ main_domain }}`) && (PathPrefix(`/api`) || PathPrefix(`/dashboard`))"
- "traefik.http.routers.api.entrypoints=https"
{% if use_authentik %}
- "traefik.http.routers.api.middlewares=authentik@file"
{% endif %}
- "traefik.http.routers.api.tls.domains[0].main={{ main_domain }}"
- "traefik.http.routers.api.tls.domains[0].sans=*.{{ main_domain }}"
- "traefik.http.routers.api.tls=true"
- "traefik.http.routers.api.tls.certresolver=main"
- "traefik.http.routers.api.service=api@internal"
- "traefik.http.services.dummy.loadbalancer.server.port=9999"
placement:
constraints: [node.role == manager]
networks:
{{ traefik_public_net }}:
external: true

View File

@ -0,0 +1,5 @@
subdomain: router
use_authentik: true
data_dir: "{{ ceph_volume }}/traefik"
traefik_version: v3.6.2

View File

@ -0,0 +1,84 @@
---
- name: FIREWALL | Reset UFW to its default settings
community.general.ufw:
state: reset
- name: FIREWALL | Allow all outgoing traffic by default
community.general.ufw:
direction: outgoing
policy: allow
- name: FIREWALL | Deny all incoming traffic by default
community.general.ufw:
direction: incoming
policy: deny
- name: FIREWALL | Allow incoming SSH traffic on the public interface
community.general.ufw:
rule: allow
port: "{{ ssh_port }}"
proto: tcp
interface: "{{ public_interface }}"
direction: in
- name: FIREWALL | Allow incoming SSH traffic on the private interface
community.general.ufw:
rule: allow
port: "{{ ssh_port }}"
proto: tcp
interface: "{{ private_interface }}"
direction: in
- name: FIREWALL | Allow incoming HTTP/HTTPS traffic on the public interface
community.general.ufw:
rule: allow
port: "{{ item.port }}"
proto: "{{ item.proto }}"
interface: "{{ public_interface }}"
direction: in
with_items:
- { port: '80', proto: 'tcp' }
- { port: '443', proto: 'tcp' }
- name: FIREWALL | Allow Ceph monitor ports on the private interface
community.general.ufw:
rule: allow
port: "{{ item }}"
proto: tcp
interface: "{{ private_interface }}"
direction: in
with_items:
- '3300'
- '6789'
- name: FIREWALL | Allow the Ceph OSD/MGR port range on the private interface
community.general.ufw:
rule: allow
port: "6800:7568"
proto: tcp
interface: "{{ private_interface }}"
direction: in
- name: FIREWALL | Allow the Docker Swarm management port on the private interface
community.general.ufw:
rule: allow
port: "2377"
proto: tcp
interface: "{{ private_interface }}"
direction: in
- name: FIREWALL | Allow Docker Swarm discovery/overlay network ports on the private interface
community.general.ufw:
rule: allow
port: "{{ item.port }}"
proto: "{{ item.proto }}"
interface: "{{ private_interface }}"
direction: in
with_items:
- { port: '7946', proto: 'tcp' }
- { port: '7946', proto: 'udp' }
- { port: '4789', proto: 'udp' }
- name: FIREWALL | Enable UFW
community.general.ufw:
state: enabled
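An optional final task could print the resulting rule set for review. A minimal sketch (not part of the original role):

- name: FIREWALL | Show the resulting rule set (optional sketch)
  ansible.builtin.command: "ufw status verbose"
  register: ufw_status
  changed_when: false
- name: FIREWALL | Print the rule set (optional sketch)
  ansible.builtin.debug:
    msg: "{{ ufw_status.stdout_lines }}"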

View File

@ -0,0 +1,3 @@
config:
hcloud:token:
secure: AAABAHkvxBXaEbrikY6bNyuwXehFp71LvsHTT2LOYHLiAaRCil5cSODn1EktYTYL+f4ryGJtN1j/wiyrAkbZBnyVC1QnSb84tTLYeKYXBtHo2fY87vReuyOwFZbFGylC

9
iac/cluster/Pulumi.yaml Normal file
View File

@ -0,0 +1,9 @@
name: gc-infra
description: A minimal Go Pulumi program
runtime: go
config:
pulumi:tags:
value:
pulumi:template: go
# hcloud:token:
# value: xqb89P4vF2YlBjU75AAtyoQzNvTHaXyhB0J2UYR8dAmEQDKz5GWeKO7KgEyPzUu5

View File

@ -16,7 +16,7 @@ import (
type Infrastructure struct {
placementGroup *hcloud.PlacementGroup
networkID *pulumi.IDOutput
masterNodes []*hcloud.Server
managerNodes []*hcloud.Server
workerNodes []*hcloud.Server
}
@ -55,30 +55,31 @@ func main() {
panic(err.Error())
}
infra.masterNodes, err = utils.CreateServer(ctx, utils.CreateServerArgs{
infra.managerNodes, err = utils.CreateServer(ctx, utils.CreateServerArgs{
PlacementGroupId: infra.placementGroup.ID(),
NetworkId: infra.networkID,
NetworkFirstIP: string(utils.IncrementIP(net.ParseIP("10.0.1.0"))),
Basename: "master-node",
Count: 1,
SshKey: hkey,
})
if err != nil {
panic(err.Error())
}
infra.workerNodes, err = utils.CreateServer(ctx, utils.CreateServerArgs{
PlacementGroupId: infra.placementGroup.ID(),
NetworkId: infra.networkID,
NetworkFirstIP: string(utils.IncrementIP(net.ParseIP("10.0.1.20"))),
Basename: "worker-node",
Count: 2,
Basename: "manager-node",
Count: 3,
SshKey: hkey,
ServerType: "ccx23",
})
if err != nil {
panic(err.Error())
}
// infra.workerNodes, err = utils.CreateServer(ctx, utils.CreateServerArgs{
// PlacementGroupId: infra.placementGroup.ID(),
// NetworkId: infra.networkID,
// NetworkFirstIP: string(utils.IncrementIP(net.ParseIP("10.0.1.20"))),
// Basename: "worker-node",
// Count: 2,
// SshKey: hkey,
// })
// if err != nil {
// panic(err.Error())
// }
for idx, s := range slices.Concat(infra.masterNodes, infra.workerNodes) {
for idx, s := range slices.Concat(infra.managerNodes, infra.workerNodes) {
err := utils.InstallAnsibleDependencies(ctx, remote.ConnectionArgs{
Host: s.Ipv4Address,
User: pulumi.String("root"),
@ -89,22 +90,28 @@ func main() {
}
}
var advAddr = infra.masterNodes[0].Networks.ApplyT(func(net []hcloud.ServerNetworkType) string {
return *net[0].Ip
}).(pulumi.StringOutput)
// var advAddr = infra.managerNodes[0].Networks.ApplyT(func(net []hcloud.ServerNetworkType) string {
// return *net[0].Ip
// }).(pulumi.StringOutput)
tokens, err := utils.InitDockerSwarm(ctx, remote.ConnectionArgs{
Host: infra.masterNodes[0].Ipv4Address,
User: pulumi.String("root"),
PrivateKey: pk.PrivateKeyOpenssh}, advAddr)
if err != nil {
panic(err.Error())
}
// tokens, err := utils.InitDockerSwarm(ctx, remote.ConnectionArgs{
// Host: infra.managerNodes[0].Ipv4Address,
// User: pulumi.String("root"),
// PrivateKey: pk.PrivateKeyOpenssh}, advAddr)
// if err != nil {
// panic(err.Error())
// }
ctx.Export("SwarmTokens", tokens)
// ctx.Export("SwarmTokens", tokens)
// inventory, err := utils.CreateAnsibleInventory(infra.managerNodes, infra.workerNodes)
// if err != nil {
// panic(err.Error())
// }
// ctx.Export("inventory", inventory)
sm := map[string]pulumi.Input{}
for idx, s := range slices.Concat(infra.masterNodes, infra.workerNodes) {
for idx, s := range slices.Concat(infra.managerNodes, infra.workerNodes) {
sm[fmt.Sprintf("node-%d-ip", idx)] = s.Ipv4Address
}
ctx.Export("server-ips", pulumi.Map(sm))

View File

@ -1,11 +1,14 @@
package utils
import (
"bytes"
"fmt"
"regexp"
"strings"
"text/template"
"github.com/pulumi/pulumi-command/sdk/go/command/remote"
"github.com/pulumi/pulumi-hcloud/sdk/go/hcloud"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
@ -14,6 +17,11 @@ type SwarmJoinTokens struct {
WorkerToken string
}
type ServerInfo struct {
Name pulumi.StringOutput
IP pulumi.StringOutput
}
func InstallAnsibleDependencies(ctx *pulumi.Context, connArgs remote.ConnectionArgs, uniqueness string) error {
_, err := remote.NewCommand(ctx, strings.Join([]string{uniqueness, "Install Ansible Dependencies"}, ": "),
&remote.CommandArgs{
@ -26,7 +34,7 @@ func InstallAnsibleDependencies(ctx *pulumi.Context, connArgs remote.ConnectionA
return nil
}
func InitDockerSwarm(ctx *pulumi.Context, connArgs remote.ConnectionArgs, advertiseAddr pulumi.StringOutput) (pulumi.StringOutput, error) {
func InitDockerSwarm(ctx *pulumi.Context, connArgs remote.ConnectionArgs, advertiseAddr pulumi.StringOutput) (pulumi.Output, error) {
var tokens SwarmJoinTokens
fullCommand := advertiseAddr.ApplyT(func(addr string) *string {
@ -44,17 +52,103 @@ func InitDockerSwarm(ctx *pulumi.Context, connArgs remote.ConnectionArgs, advert
return pulumi.StringOutput{}, err
}
return out.Stdout.ApplyT(func(output string) string {
return out.Stdout.ApplyT(func(output string) SwarmJoinTokens {
searchWorker := "Worker Token: "
pattern := regexp.MustCompile(searchWorker + `(\S+)`)
patternWorker := regexp.MustCompile(searchWorker + `(\S+)`)
searchManager := "Manager Token: "
patternManager := regexp.MustCompile(searchManager + `(\S+)`)
matches := pattern.FindStringSubmatch(output)
matches := patternWorker.FindStringSubmatch(output)
if len(matches) > 1 {
extracted := matches[1]
tokens.WorkerToken = extracted
return extracted
}
fmt.Println(tokens.WorkerToken)
return ""
}).(pulumi.StringOutput), nil
matches = patternManager.FindStringSubmatch(output)
if len(matches) > 1 {
extracted := matches[1]
tokens.ManagerToken = extracted
}
return tokens
}), nil
}
func CreateAnsibleInventory(managerNodes, workerNodes []*hcloud.Server) (pulumi.Output, error) {
serverInfos := toServerInfo(managerNodes)
return pulumi.All(pulumi.ToOutput(serverInfos)).ApplyT(func(results []interface{}) (string, error) {
var serverInfos = results[0].([]ServerInfo)
// var workerSlice = results[1].([]*hcloud.Server)
serverData := make(map[string][]ServerInfo)
for _, s := range serverInfos {
serverData["Manager"] = append(serverData["Manager"], ServerInfo{
Name: s.Name,
IP: s.IP,
})
}
// for _, result := range workerSlice {
// server := result.(map[string]interface{})
// serverData["Worker"] = append(serverData["Worker"], ServerInfo{
// Name: server["name"].(string),
// IP: server["ipv4_address"].(string),
// })
// }
fmt.Println(serverData["Manager"])
fmt.Println(results[0])
return generateInventoryFile(serverData)
}).(pulumi.Output), nil
}
func toServerInfo(server []*hcloud.Server) pulumi.ArrayOutput {
serverInfo := []ServerInfo{}
for _, s := range server {
serverInfo = append(serverInfo, ServerInfo{
Name: s.Name,
IP: s.Ipv4Address,
})
}
return pulumi.All(serverInfo).ApplyT(func(args []interface{}) []interface{} {
var serverInfo []interface{}
for _, s := range args {
val := s.(map[string]interface{})
serverInfo = append(serverInfo, map[string]interface{}{
"Name": val["Name"].(string),
"IP": val["IP"].(string),
})
}
return serverInfo
}).(pulumi.ArrayOutput)
}
func generateInventoryFile(inventory map[string][]ServerInfo) (string, error) {
const inventoryTmpl = `
[all]
{{ range .Manager }}
{{ .Name }} ansible_host={{ .IP }} ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=../infra-base/private_key
{{ end }}
{{ range .Worker }}
{{ .Name }} ansible_host={{ .IP }} ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=../infra-base/private_key
{{ end }}
[manager]
{{ range .Manager }}
{{ .Name }} ansible_host={{ .IP }} ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=../infra-base/private_key
{{ end }}
[worker]
{{ range .Worker }}
{{ .Name }} ansible_host={{ .IP }} ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=../infra-base/private_key
{{ end }}
`
tmpl, err := template.New("inventory").Parse(inventoryTmpl)
if err != nil {
return "", err
}
var buf bytes.Buffer
err = tmpl.Execute(&buf, inventory)
if err != nil {
return "", err
}
return buf.String(), nil
}

View File

@ -54,6 +54,7 @@ type CreateServerArgs struct {
Basename string
Count int
SshKey *hcloud.SshKey
ServerType string
}
func CreateServer(ctx *pulumi.Context, cfg CreateServerArgs) ([]*hcloud.Server, error) {
@ -64,9 +65,8 @@ func CreateServer(ctx *pulumi.Context, cfg CreateServerArgs) ([]*hcloud.Server,
s, err := hcloud.NewServer(ctx, sn, &hcloud.ServerArgs{
Name: pulumi.String(sn),
Image: pulumi.String("docker-ce"),
ServerType: pulumi.String("cpx21"),
Location: pulumi.StringPtr("fsn1"),
// Datacenter: pulumi.StringPtr("fsn1"),
ServerType: pulumi.String(cfg.ServerType),
Location: pulumi.StringPtr("hel1"),
Networks: hcloud.ServerNetworkTypeArray{
&hcloud.ServerNetworkTypeArgs{
NetworkId: IDtoIntOutput(cfg.NetworkId),
@ -85,6 +85,24 @@ func CreateServer(ctx *pulumi.Context, cfg CreateServerArgs) ([]*hcloud.Server,
if err != nil {
return nodes, err
}
cephVolume, err := hcloud.NewVolume(ctx, fmt.Sprintf("ceph-%s", sn), &hcloud.VolumeArgs{
Name: pulumi.Sprintf("%s-ceph-vol-0%d", s.Name, i+1),
Size: pulumi.Int(100),
Location: s.Location,
})
if err != nil {
return nodes, fmt.Errorf("couldn't create volume: %w", err)
}
_, err = hcloud.NewVolumeAttachment(ctx, fmt.Sprintf("ceph-vol-attach-%s", sn), &hcloud.VolumeAttachmentArgs{
VolumeId: IDtoIntOutput(cephVolume.ID()),
ServerId: IDtoIntOutput(s.ID()),
})
if err != nil {
return nodes, fmt.Errorf("couldn't attach volume to node %d", i)
}
nodes = append(nodes, s)
nextIp = IncrementIP(net.ParseIP(nextIp)).String()
}