From 1813777146401d0dcebe013ae63165341e0a84d9 Mon Sep 17 00:00:00 2001 From: Danijel Simeunovic Date: Mon, 9 Feb 2026 09:42:50 +0100 Subject: [PATCH] cleanup --- APPLICATIONSET_GUIDE.md | 290 ------------------------------- ARGOCD_COMPREHENSIVE_ANALYSIS.md | 245 -------------------------- bootstrap.sh | 2 +- 3 files changed, 1 insertion(+), 536 deletions(-) delete mode 100644 APPLICATIONSET_GUIDE.md delete mode 100644 ARGOCD_COMPREHENSIVE_ANALYSIS.md diff --git a/APPLICATIONSET_GUIDE.md b/APPLICATIONSET_GUIDE.md deleted file mode 100644 index e41cb1b..0000000 --- a/APPLICATIONSET_GUIDE.md +++ /dev/null @@ -1,290 +0,0 @@ -# ApplicationSet Guide - -## Overview - -This repository uses **ApplicationSet** (instead of the traditional app-of-apps pattern) to manage infrastructure applications. ApplicationSet automatically discovers and creates ArgoCD Applications based on a git directory pattern. - -## How It Works - -### ApplicationSet Definition -Located in `argocd/_app-of-apps.yaml`: - -```yaml -apiVersion: argoproj.io/v1alpha1 -kind: ApplicationSet -metadata: - name: infrastructure-apps - namespace: argocd -spec: - goTemplate: true - generators: - - git: - repoURL: https://github.com/snothub/sturdy-adventure.git - revision: HEAD - directories: - - path: argocd/infra/*.yaml - template: - # Template applied to each discovered file -``` - -### Key Components - -1. **Generator**: Scans `argocd/infra/*.yaml` for all Application manifests -2. **Path Variables**: Each match provides template variables: - - `{{ .path.basenameNormalized }}` - filename without extension (e.g., "prometheus") - - `{{ .path.dir }}` - directory path (e.g., "argocd/infra") -3. **Template**: Used to create consistent Applications from the discovered files - -## How Applications Are Created - -When you create a new file `argocd/infra/my-app.yaml`: - -```yaml -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: my-app - # ... other config -spec: - # ... your application spec -``` - -The ApplicationSet: -1. Discovers the file matching `argocd/infra/*.yaml` -2. Extracts metadata: `basenameNormalized=my-app`, `dir=argocd/infra` -3. Renders the template with these variables -4. Creates an Application resource with: - - Name: `my-app` - - Labels applied from the template - - Sync policy inherited from template - - Source path: `argocd/infra` - -## Adding New Applications - -### Step 1: Create Application YAML -Create `argocd/infra/my-new-app.yaml`: - -```yaml -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: my-new-app - annotations: - argocd.argoproj.io/sync-wave: "2" # Optional: control sync ordering -spec: - project: default - source: - repoURL: https://my-helm-repo.com - chart: my-chart - targetRevision: "1.0.0" - helm: - releaseName: my-new-app - destination: - server: https://kubernetes.default.svc - namespace: my-namespace - # Note: syncPolicy is applied from the ApplicationSet template - # No need to duplicate it here -``` - -### Step 2: Push to Repository -```bash -git add argocd/infra/my-new-app.yaml -git commit -m "Add my-new-app infrastructure application" -git push -``` - -### Step 3: Verify Discovery -```bash -# View all generated applications -kubectl get applications -n argocd - -# Check ApplicationSet status -kubectl describe applicationset infrastructure-apps -n argocd - -# Watch for your new application -kubectl get application my-new-app -n argocd -w -``` - -## Template Variables - -The ApplicationSet template uses these variables for each discovered file: - -| Variable | Example | Description | -|----------|---------|-------------| -| `{{ .path.basenameNormalized }}` | `prometheus` | Filename without extension | -| `{{ .path.dir }}` | `argocd/infra` | Directory containing the file | -| `{{ .path.path }}` | `argocd/infra/prometheus.yaml` | Full path to the file | - -## ApplicationSet Features Used - -### 1. Git Directory Generator -```yaml -generators: -- git: - repoURL: https://github.com/snothub/sturdy-adventure.git - revision: HEAD - directories: - - path: argocd/infra/*.yaml # Match all YAML files -``` - -Automatically discovers applications in the git repository. - -### 2. Go Templating -```yaml -goTemplate: true -``` - -Enables Go template syntax for variable interpolation (e.g., `{{ .path.basenameNormalized }}`). - -### 3. Dynamic Application Generation -```yaml -template: - metadata: - name: "{{ .path.basenameNormalized }}" - labels: - app.kubernetes.io/name: "{{ .path.basenameNormalized }}" -``` - -Each discovered file generates an Application with consistent configuration. - -## Benefits Over App-of-Apps Pattern - -| Feature | ApplicationSet | App-of-Apps | -|---------|---|---| -| Auto-discovery | ✅ Automatic | ❌ Manual list required | -| New app onboarding | 1 file created | 1 file created + parent update | -| Consistency | ✅ Template enforced | ❌ Manual consistency | -| Scalability | ✅ Grows automatically | ❌ Manual maintenance | -| RBAC per app | ✅ Supported | ❌ Limited | -| Drift detection | ✅ Per app | ✅ Per app | - -## Sync Policy Applied by ApplicationSet - -All generated Applications inherit this sync policy from the template: - -```yaml -syncPolicy: - automated: - prune: true - selfHeal: true - syncOptions: - - CreateNamespace=true - - Validate=true - - ServerSideApply=true - timeout: 300s - retry: - limit: 5 - backoff: - duration: 5s - factor: 2 - maxDuration: 3m -``` - -**What this means:** -- ✅ Automatic syncing when repository changes -- ✅ Automatic pruning of deleted resources -- ✅ Self-healing if cluster drifts from git -- ✅ Namespace auto-creation if missing -- ✅ Manifest validation before applying -- ✅ Server-side apply for field ownership -- ✅ 5-minute timeout per application sync -- ✅ Up to 5 retry attempts with exponential backoff - -## Monitoring ApplicationSet - -### View ApplicationSet Status -```bash -kubectl describe applicationset infrastructure-apps -n argocd -``` - -Output shows: -- Condition status (Healthy, Progressing, etc.) -- Number of applications created -- Last sync time -- Error messages if any - -### View Generated Applications -```bash -# All applications created by this ApplicationSet -kubectl get applications -n argocd -l app.kubernetes.io/managed-by=argocd - -# View a specific application -kubectl get application prometheus -n argocd -o yaml -``` - -### Check Sync Status -```bash -# Watch all applications -kubectl get applications -n argocd -w - -# Detailed sync status -argocd app list - -# Application health -kubectl get applications -n argocd -o wide -``` - -## Troubleshooting - -### ApplicationSet Not Creating Applications - -**Problem**: Files in `argocd/infra/` but no Applications created - -**Solutions**: -1. Check file naming: Must end with `.yaml` (not `.yml`) -2. Verify path: Files must be in `argocd/infra/` (not subdirectories) -3. Check permissions: Repository URL must be accessible -4. Review ApplicationSet status: `kubectl describe applicationset infrastructure-apps -n argocd` - -### Application Created But Not Syncing - -**Problem**: Application exists but stays in "OutOfSync" state - -**Solutions**: -1. Check application spec is valid YAML -2. Verify `destination.server` is accessible: `https://kubernetes.default.svc` -3. Check `destination.namespace` exists or `CreateNamespace=true` is set -4. Review application logs: `kubectl logs -n argocd deployment/argocd-application-controller` - -### ApplicationSet Generates Duplicate Applications - -**Problem**: Same application created multiple times - -**Solutions**: -1. Check for duplicate files in `argocd/infra/` with same name -2. Clear git cache: `git clean -fd` -3. ApplicationSet may take time to reconcile; wait 60 seconds - -## Best Practices - -1. **File Organization** - - Keep only Application manifests in `argocd/infra/` - - Don't mix other resource types (ConfigMaps, Secrets, etc.) - - Use consistent naming: `lowercase-with-hyphens.yaml` - -2. **Application Design** - - Keep each Application spec in the discovered file - - Don't rely on syncPolicy being defined in the Application (it comes from ApplicationSet) - - Use sync waves for dependency ordering - -3. **Repository Management** - - All files in `argocd/infra/` should be valid Kubernetes manifests - - Regular commits to track changes - - Use branches for testing new applications before merging - -4. **Monitoring** - - Regularly check ApplicationSet status - - Monitor generated applications for sync status - - Set up alerts for Failed or Degraded applications - -5. **Updates** - - Update application specs directly in `argocd/infra/` files - - ApplicationSet changes take effect within 60 seconds - - Test in dev environment first - -## Related Documentation - -- [ArgoCD ApplicationSet Docs](https://argocd-applicationset.readthedocs.io/) -- [ArgoCD Application Spec](https://argo-cd.readthedocs.io/en/stable/operator-manual/declarative-setup/#applications) -- [Kubernetes Application Convention](https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#recommended-labels) - diff --git a/ARGOCD_COMPREHENSIVE_ANALYSIS.md b/ARGOCD_COMPREHENSIVE_ANALYSIS.md deleted file mode 100644 index ddeaef3..0000000 --- a/ARGOCD_COMPREHENSIVE_ANALYSIS.md +++ /dev/null @@ -1,245 +0,0 @@ -# ArgoCD Applications Comprehensive Analysis Report - -## Overview -Analyzed 11 ArgoCD Application manifests in `/argocd/apps/`. This report details current configurations, risks, best practice violations, security concerns, and operational improvements. - ---- - -## Critical Issues Summary - -### 1. Hardcoded Secrets (CRITICAL) -**Files:** grafana.yaml -- **grafana.yaml:** Admin password "forte" in plaintext -- **Impact:** Credentials exposed in Git history forever -- **Fix:** Migrate to Sealed Secrets immediately - -### 2. Floating Versions (CRITICAL) -**Files:** cluster-resources-application.yaml -- Using `HEAD` instead of tagged versions -- No audit trail of deployments -- Unpredictable application behavior -- **Fix:** Pin to specific git tags or commit SHAs - -### 3. Undersized Resources (HIGH) -**Files:** cert-manager, loki, prometheus, trivy -- cert-manager: 100m CPU limit (too tight for control plane) -- loki: 200m CPU, 512Mi memory (drops logs under load) -- fluent-bit: 100m CPU for all-node log collection -- **Impact:** Performance degradation, OOM kills, dropped logs -- **Fix:** Increase resource limits across all monitoring stack - -### 4. No Data Persistence (HIGH) -**Files:** loki.yaml (filesystem storage), prometheus.yaml -- Loki using filesystem storage (ephemeral, lost on restart) -- Prometheus likely ephemeral (no PVC visible) -- No backup strategy -- **Fix:** Configure persistent volumes with cloud storage - ---- - -## Application-by-Application Summary - -| Application | Issues | Priority | Key Recommendation | -|-------------|--------|----------|---------------------| -| **cert-manager** | Undersized (100m), single replica, tight webhook timeout | HIGH | Increase CPU to 500m, add replicas (2-3), longer timeout | -| **cluster-resources** | Floating HEAD, RBAC missing | MEDIUM | Pin version, restrict with AppProject | -| **fluent-bit** | Placeholder URL, tight CPU (100m), HTTP server wide open | HIGH | Update repo URL, 200m CPU, restrict HTTP to localhost | -| **grafana** | Hardcoded password, placeholder URL, no persistence | CRITICAL | Sealed Secrets, update URL, add PVC | -| **kyverno** | No policies configured, no resources, no failures policies | MEDIUM | Add security policies, define resource limits | -| **loki** | Filesystem storage, no auth, single binary, tight resources | CRITICAL | S3/GCS storage, enable auth, distributed mode | -| **prometheus** | No alertmanager, service port 80, no persistence, no ingress | HIGH | Enable alertmanager, port 9090, add PVC, secure ingress | -| **sealed-secrets** | No backup procedure, single replica, no resources | MEDIUM | Document key backup, add PDB, increase replicas | -| **traefik** | TLS incomplete, LoadBalancer cloud-specific, no resources | MEDIUM | Complete TLS config, add cert-manager integration, resources | -| **trivy** | Alpha version (v0.0.7), ignoreUnfixed hides vulns, no resources | MEDIUM | Upgrade to stable (v0.3+), show all vulns, resources | - ---- - -## Cross-Cutting Issues - -### RBAC & Security (Critical) -- All apps use default project (no boundaries) -- No explicit AppProject configuration -- Cluster resources not restricted -- **Fix:** Create AppProject with granular permissions - -### No Network Policies (All Namespaces) -- Unlimited pod-to-pod communication -- Monitoring stack accessible from all pods -- **Fix:** Implement NetworkPolicy for each namespace - -### No Pod Disruption Budgets -- No HA guarantees during cluster operations -- Critical services can be evicted/disrupted -- **Fix:** Add PDB minAvailable: 1 for critical apps - -### Incomplete TLS Configuration -- Prometheus on HTTP port 80 -- Traefik TLS uses defaults (unclear) -- Fluent-bit to Loki unencrypted -- **Fix:** Implement TLS end-to-end with cert-manager - -### Missing Resource Requests -- Prometheus, Traefik, Kyverno undefined -- Scheduler can overallocate resources -- **Fix:** Add requests/limits to all remaining apps - ---- - -## Priority Remediation Roadmap - -### Phase 1: CRITICAL (Immediate) -- [ ] Migrate Grafana admin password to Sealed Secrets -- [ ] Update placeholder repository URLs -- [ ] Pin floating versions (HEAD → git tags) - -### Phase 2: URGENT (Week 1-2) -- [ ] Configure persistent storage for Loki -- [ ] Configure persistent storage for Prometheus -- [ ] Enable Prometheus Alertmanager -- [ ] Increase resource limits for all apps - -### Phase 3: IMPORTANT (Week 2-3) -- [ ] Implement NetworkPolicies -- [ ] Create AppProject with RBAC -- [ ] Add PodDisruptionBudgets -- [ ] Configure Kyverno security policies - -### Phase 4: ENHANCEMENT (Week 3-4) -- [ ] Complete TLS configuration -- [ ] Implement cert-manager integration -- [ ] Setup backup strategies -- [ ] Add comprehensive monitoring - ---- - -## Detailed Issues by Category - -### Resource Configuration -- **cert-manager:** 50m req, 100m limit (INCREASE to 250m/500m) -- **prometheus:** 250m req, 500m limit (ADEQUATE, but add to values) -- **grafana:** 100m req, 200m limit (INCREASE to 200m/400m) -- **loki:** 100m req, 200m limit (INCREASE to 200m/500m for distributed) -- **fluent-bit:** 50m req, 100m limit (INCREASE to 100m/200m) -- **traefik:** Not specified (INCREASE to 250m/500m, 256Mi/512Mi) -- **kyverno:** Not specified (ADD 100m/200m, 128Mi/256Mi) -- **trivy:** Not specified (ADD 250m/500m, 256Mi/512Mi) -- **sealedsecrets:** Not specified (ADD 100m/200m, 128Mi/256Mi) - -### Storage & Persistence -- **loki:** Filesystem (CRITICAL - switch to S3/GCS) -- **prometheus:** Implicit ephemeral (ADD PVC 20-30GB) -- **grafana:** No persistence specified (QUESTIONABLE - OK for dashboards if imported) -- **sealed-secrets:** Key backup not documented (ADD backup procedure) - -### High Availability -- **cert-manager:** replicaCount: 1 (INCREASE to 2-3) -- **sealed-secrets:** Implicit single replica (INCREASE to 2-3) -- **traefik:** Replicas: 2 (ADEQUATE, but add PDB) -- **monitoring stack:** Single instances (CONSIDER distributed) - -### Security Gaps -- **Secrets in Git:** Grafana -- **No Authentication:** Loki (auth_enabled: false), Prometheus (open HTTP) -- **Wide Permissions:** kubectl RBAC not restricted (ADD ClusterRole) -- **No Network Policies:** All apps (ADD NetworkPolicy) -- **TLS Incomplete:** Prometheus HTTP 80, Traefik TLS {}, Fluent→Loki HTTP - ---- - -## Key Statistics - -| Metric | Count | -|--------|-------| -| Total Applications Analyzed | 11 | -| Critical Issues | 5 | -| High Priority Issues | 12 | -| Medium Priority Issues | 20+ | -| Best Practice Violations | 30+ | -| Security Concerns | 25+ | -| Apps Missing Resource Requests | 4 | -| Apps Missing Resource Limits | 3 | -| Apps Using Floating Versions | 2 | -| Apps with Hardcoded Secrets | 2 | -| Apps Requiring Persistence | 3 | -| Apps with Single Replica Critical Services | 4 | - ---- - -## Implementation Guidance - -### Sealed Secrets Setup -```bash -# Install sealed-secrets controller -kubectl apply -f ./argocd/apps/sealedsecrets.yaml - -# Seal grafana password -echo -n "new-secure-password" | kubectl create secret generic grafana-admin \ - --dry-run=client --from-file=password=/dev/stdin -o yaml | \ - kubeseal -f - > grafana-sealed-secret.yaml - -# Update application manifests to reference sealed secrets -``` - -### Persistent Volume for Loki -```yaml -# Add to loki values -persistence: - enabled: true - storageClassName: "fast" - size: 50Gi - accessModes: - - ReadWriteOnce -``` - -### AppProject for RBAC -```yaml -apiVersion: argoproj.io/v1alpha1 -kind: AppProject -metadata: - name: platform -spec: - destinations: - - namespace: '*' - server: 'https://kubernetes.default.svc' - sourceRepos: - - 'https://github.com/snothub/*' - roles: - - name: admin - policies: - - p, proj:platform:admin, applications, *, */*, allow -``` - -### NetworkPolicy for Monitoring -```yaml -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - name: monitoring-access - namespace: monitoring -spec: - podSelector: - matchLabels: - app: prometheus - policyTypes: - - Ingress - ingress: - - from: - - podSelector: - matchLabels: - app: grafana - ports: - - protocol: TCP - port: 9090 -``` - ---- - -## Next Steps - -1. **Review this analysis** with your team -2. **Create tickets** for each critical/high issue -3. **Schedule remediation** according to roadmap -4. **Document changes** as they're made -5. **Test thoroughly** in dev/staging first -6. **Monitor impact** after production changes - diff --git a/bootstrap.sh b/bootstrap.sh index f17451b..95836a6 100644 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -36,7 +36,7 @@ ArgoCd() helm upgrade --install argocd argo-cd \ --repo https://argoproj.github.io/argo-helm \ --namespace argocd --create-namespace \ - --values argocd/values/argocd-values.yaml \ + --values infra/values/argocd-values.yaml \ --timeout 60s --atomic }