From 5ed60d85b0e82c4ef84169ba8781529ad2db3b64 Mon Sep 17 00:00:00 2001
From: lizhukun777 <lizhukun777@gmail.com>
Date: Sat, 29 Nov 2025 10:39:24 +0800
Subject: [PATCH 1/2] feat: add semantic snapshot example

---
 semantic-snapshot/go-semantic-snapshot-v4.py | 554 +++++++++++++
 semantic-snapshot/py-semantic-snapshot-v3.py | 795 +++++++++++++++++++
 2 files changed, 1349 insertions(+)
 create mode 100644 semantic-snapshot/go-semantic-snapshot-v4.py
 create mode 100644 semantic-snapshot/py-semantic-snapshot-v3.py

diff --git a/semantic-snapshot/go-semantic-snapshot-v4.py b/semantic-snapshot/go-semantic-snapshot-v4.py
new file mode 100644
index 0000000..1b86b81
--- /dev/null
+++ b/semantic-snapshot/go-semantic-snapshot-v4.py
@@ -0,0 +1,554 @@
+# -*- coding: utf-8 -*-
+"""
+go-semantic-snapshot.py
+Go 项目语义快照生成器 (v2.0) - 混合策略 (AST + Regex) & 极致 Token 压缩
+
+主要特性：
+1. 混合解析策略：优先调用系统 `go` 命令进行 AST 精确解析，失败自动回退到 Regex。
+2. 深度语义分析：包含循环复杂度(CCN)、错误处理热点、并发模式(Goroutine/Channel)、泛型支持。
+3. 极致 Token 压缩：使用缩写键名 (n, p, r, cx...)，结构紧凑，专为 LLM Context Window 优化。
+
+使用示例：
+    python go-semantic-snapshot.py ./my-go-project
+    python go-semantic-snapshot.py ./my-go-project -o digest.yaml --graph
+"""
+
+from __future__ import unicode_literals, print_function
+import os
+import re
+import sys
+import subprocess
+import json
+import tempfile
+import shutil
+from collections import defaultdict
+
+try:
+    import yaml  # pip install PyYAML
+except ImportError:
+    print("Error: PyYAML not installed. Please run: pip install PyYAML")
+    sys.exit(1)
+
+# Python 2/3 compatibility
+if sys.version_info[0] == 2:
+    text_type = unicode  # noqa: F821
+else:
+    text_type = str
+
+# ---------------------------------------------------------
+# 嵌入式 Go AST 解析器源码 (用于精确分析)
+# ---------------------------------------------------------
+GO_AST_PARSER_SRC = r"""
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"go/ast"
+	"go/parser"
+	"go/token"
+	"os"
+	"strings"
+)
+
+// 压缩键名结构定义
+type Node struct {
+	Name       string      `json:"n,omitempty"`  // Name
+	Type       string      `json:"t,omitempty"`  // Type / Signature
+	Params     string      `json:"p,omitempty"`  // Params
+	Returns    string      `json:"r,omitempty"`  // Returns
+	Receiver   string      `json:"rc,omitempty"` // Receiver
+	Fields     []*Node     `json:"fd,omitempty"` // Fields
+	Methods    []*Node     `json:"md,omitempty"` // Interface Methods
+	Complexity int         `json:"cx,omitempty"` // Cyclomatic Complexity
+	IsGeneric  bool        `json:"gn,omitempty"` // Uses Generics
+}
+
+type FileSummary struct {
+	Package   string             `json:"pk"`
+	Imports   []string           `json:"im,omitempty"`
+	Structs   []*Node            `json:"st,omitempty"`
+	Ifaces    []*Node            `json:"if,omitempty"`
+	Funcs     []*Node            `json:"fn,omitempty"`
+	Methods   map[string][]*Node `json:"md,omitempty"` // Key: Receiver Type
+	Vars      []string           `json:"vr,omitempty"`
+	Consts    []string           `json:"cn,omitempty"`
+	Comments  []string           `json:"cm,omitempty"` // Sampled comments
+	Stats     FileStats          `json:"stat"`
+}
+
+type FileStats struct {
+	Goroutines int `json:"gr,omitempty"` // count of 'go func'
+	Channels   int `json:"ch,omitempty"` // count of channel ops
+	Errors     int `json:"er,omitempty"` // count of 'if err != nil'
+}
+
+func main() {
+	if len(os.Args) < 2 {
+		os.Exit(1)
+	}
+	fset := token.NewFileSet()
+	node, err := parser.ParseFile(fset, os.Args[1], nil, parser.ParseComments)
+	if err != nil {
+		os.Exit(1)
+	}
+
+	summary := FileSummary{
+		Package: node.Name.Name,
+		Methods: make(map[string][]*Node),
+	}
+
+	// 提取 Imports
+	for _, imp := range node.Imports {
+		path := strings.Trim(imp.Path.Value, "\"")
+		summary.Imports = append(summary.Imports, path)
+	}
+
+	// 访问 AST
+	ast.Inspect(node, func(n ast.Node) bool {
+		switch x := n.(type) {
+		
+		// 统计并发与错误处理
+		case *ast.GoStmt:
+			summary.Stats.Goroutines++
+		case *ast.SendStmt:
+			summary.Stats.Channels++
+		case *ast.IfStmt:
+			// 简单的 heuristic 检测 if err != nil
+			if binExpr, ok := x.Cond.(*ast.BinaryExpr); ok {
+				if x, ok := binExpr.X.(*ast.Ident); ok && x.Name == "err" {
+					summary.Stats.Errors++
+				}
+			}
+
+		case *ast.FuncDecl:
+			fnNode := &Node{
+				Name:       x.Name.Name,
+				Complexity: calcComplexity(x.Body),
+				IsGeneric:  x.Type.TypeParams != nil,
+			}
+			// 参数与返回值签名
+			fnNode.Params, fnNode.Returns = extractSig(x.Type)
+
+			if x.Recv == nil {
+				summary.Funcs = append(summary.Funcs, fnNode)
+			} else {
+				// 方法
+				recvType := formatType(x.Recv.List[0].Type)
+				// 清理指针符号以便分组
+				rawRecv := strings.TrimLeft(recvType, "*")
+				fnNode.Receiver = recvType
+				summary.Methods[rawRecv] = append(summary.Methods[rawRecv], fnNode)
+			}
+
+		case *ast.GenDecl:
+			if x.Tok == token.TYPE {
+				for _, spec := range x.Specs {
+					typeSpec := spec.(*ast.TypeSpec)
+					tNode := &Node{Name: typeSpec.Name.Name}
+					
+					// 泛型检测
+					if typeSpec.TypeParams != nil {
+						tNode.IsGeneric = true
+					}
+
+					switch t := typeSpec.Type.(type) {
+					case *ast.StructType:
+						// 提取 Struct 字段 (限制前10个)
+						count := 0
+						for _, field := range t.Fields.List {
+							if count > 10 { break }
+							typeStr := formatType(field.Type)
+							if len(field.Names) == 0 {
+								// 嵌入字段
+								tNode.Fields = append(tNode.Fields, &Node{Type: typeStr})
+							} else {
+								for _, name := range field.Names {
+									tNode.Fields = append(tNode.Fields, &Node{Name: name.Name, Type: typeStr})
+								}
+							}
+							count++
+						}
+						summary.Structs = append(summary.Structs, tNode)
+
+					case *ast.InterfaceType:
+						// 提取 Interface 方法
+						for _, method := range t.Methods.List {
+							if len(method.Names) > 0 {
+								p, r := extractSig(method.Type.(*ast.FuncType))
+								tNode.Methods = append(tNode.Methods, &Node{
+									Name: method.Names[0].Name, 
+									Params: p, 
+									Returns: r,
+								})
+							}
+						}
+						summary.Ifaces = append(summary.Ifaces, tNode)
+					}
+				}
+			} else if x.Tok == token.VAR {
+				for _, spec := range x.Specs {
+					vSpec := spec.(*ast.ValueSpec)
+					for _, name := range vSpec.Names {
+						summary.Vars = append(summary.Vars, name.Name)
+					}
+				}
+			} else if x.Tok == token.CONST {
+				for _, spec := range x.Specs {
+					cSpec := spec.(*ast.ValueSpec)
+					for _, name := range cSpec.Names {
+						summary.Consts = append(summary.Consts, name.Name)
+					}
+				}
+			}
+		}
+		return true
+	})
+	
+	// 通道类型检测补充
+	ast.Inspect(node, func(n ast.Node) bool {
+		if t, ok := n.(*ast.ChanType); ok {
+			_ = t
+			summary.Stats.Channels++ // Count definition of channels too
+		}
+		return true
+	})
+
+	// 注释采样 (取 doc)
+	if len(node.Comments) > 0 {
+		for i, cg := range node.Comments {
+			if i >= 5 { break } // limit
+			txt := strings.TrimSpace(cg.Text())
+			if len(txt) > 5 && !strings.HasPrefix(txt, "TODO") {
+				if len(txt) > 100 { txt = txt[:100] + "..." }
+				summary.Comments = append(summary.Comments, txt)
+			}
+		}
+	}
+
+	b, _ := json.Marshal(summary)
+	fmt.Println(string(b))
+}
+
+// 简单的复杂度计算 (McCabe 简化版)
+func calcComplexity(body *ast.BlockStmt) int {
+	count := 1
+	if body == nil { return count }
+	ast.Inspect(body, func(n ast.Node) bool {
+		switch n.(type) {
+		case *ast.IfStmt, *ast.ForStmt, *ast.RangeStmt, *ast.CaseClause:
+			count++
+		case *ast.BinaryExpr:
+			// 统计 && 和 ||
+			be := n.(*ast.BinaryExpr)
+			if be.Op == token.LAND || be.Op == token.LOR {
+				count++
+			}
+		}
+		return true
+	})
+	return count
+}
+
+// 提取函数签名
+func extractSig(t *ast.FuncType) (params, returns string) {
+	ps := []string{}
+	if t.Params != nil {
+		for _, f := range t.Params.List {
+			typeStr := formatType(f.Type)
+			if len(f.Names) == 0 {
+				ps = append(ps, typeStr)
+			} else {
+				for range f.Names {
+					ps = append(ps, typeStr) // 简化：只存类型，省 token
+				}
+			}
+		}
+	}
+	rs := []string{}
+	if t.Results != nil {
+		for _, f := range t.Results.List {
+			rs = append(rs, formatType(f.Type))
+		}
+	}
+	return strings.Join(ps, ","), strings.Join(rs, ",")
+}
+
+// 极简类型格式化
+func formatType(expr ast.Expr) string {
+	switch t := expr.(type) {
+	case *ast.Ident: return t.Name
+	case *ast.StarExpr: return "*" + formatType(t.X)
+	case *ast.SelectorExpr: return formatType(t.X) + "." + t.Sel.Name
+	case *ast.ArrayType: return "[]" + formatType(t.Elt)
+	case *ast.MapType: return "map[" + formatType(t.Key) + "]" + formatType(t.Value)
+	case *ast.InterfaceType: return "interface{}"
+	case *ast.ChanType: return "chan " + formatType(t.Value)
+	default: return "T"
+	}
+}
+"""
+
+# ---------------------------------------------------------
+# 工具函数
+# ---------------------------------------------------------
+
+def ensure_unicode(s):
+    if isinstance(s, text_type): return s
+    if isinstance(s, bytes): return s.decode('utf-8', errors='ignore')
+    return text_type(s)
+
+def get_gitignored_files(repo_path):
+    try:
+        ignored = subprocess.check_output(
+            ["git", "ls-files", "--others", "-i", "--exclude-standard"],
+            cwd=repo_path
+        ).decode('utf-8', errors='ignore').splitlines()
+        return set(ignored)
+    except Exception:
+        return set()
+
+# ---------------------------------------------------------
+# 核心分析器类
+# ---------------------------------------------------------
+
+class HybridGoExtractor:
+    def __init__(self, filepath, temp_go_parser_path=None):
+        self.filepath = filepath
+        self.temp_go_parser_path = temp_go_parser_path
+        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
+            self.content = f.read()
+        
+        # 默认空数据结构 (Short Keys)
+        self.data = {
+            "pk": "unknown", "im": [], "st": [], "if": [], 
+            "fn": [], "md": {}, "vr": [], "cn": [], 
+            "cm": [], "stat": {"gr": 0, "ch": 0, "er": 0}
+        }
+
+    def process(self):
+        # 1. 尝试 Go AST 解析
+        if self.temp_go_parser_path and self._try_ast_parse():
+            return self.data
+        
+        # 2. 回退到正则解析
+        self._regex_parse()
+        return self.data
+
+    def _try_ast_parse(self):
+        """尝试运行嵌入的 Go AST 解析器"""
+        try:
+            # 调用 go run parser.go target.go
+            cmd = ["go", "run", self.temp_go_parser_path, self.filepath]
+            # 设置超时防止卡死
+            if sys.version_info[0] >= 3:
+                output = subprocess.check_output(cmd, stderr=subprocess.DEVNULL, timeout=5)
+            else:
+                output = subprocess.check_output(cmd, stderr=open(os.devnull, 'w'))
+            
+            parsed = json.loads(output.decode('utf-8'))
+            
+            # 映射数据
+            self.data = parsed
+            # 确保 map 存在
+            if not self.data.get("md"): self.data["md"] = {}
+            return True
+        except Exception:
+            return False
+
+    def _regex_parse(self):
+        """正则回退模式 (尽可能模拟 AST 输出结构)"""
+        c = self.content
+        
+        # Package
+        m = re.search(r'^\s*package\s+(\w+)', c, re.MULTILINE)
+        if m: self.data["pk"] = m.group(1)
+
+        # Imports
+        self.data["im"] = re.findall(r'import\s+"([^"]+)"', c)
+        multi = re.findall(r'import\s+\(\s*([\s\S]*?)\s*\)', c)
+        for blk in multi:
+            self.data["im"].extend(re.findall(r'"([^"]+)"', blk))
+        self.data["im"] = sorted(list(set(self.data["im"])))
+
+        # Functions (func Name(...) ...)
+        for m in re.finditer(r'func\s+(\w+)\s*\(([^)]*)\)\s*([^{]*)', c):
+            name, params, ret = m.groups()
+            # 简单计算复杂度 (count if/for)
+            body_start = m.end()
+            body_sample = c[body_start:body_start+500]
+            cx = body_sample.count("if ") + body_sample.count("for ") + 1
+            
+            self.data["fn"].append({
+                "n": name,
+                "p": params[:50], # truncate
+                "r": ret.strip()[:30],
+                "cx": cx
+            })
+
+        # Methods (func (r Type) Name(...) ...)
+        for m in re.finditer(r'func\s+\([^)]+\s+\*?(\w+)\)\s+(\w+)', c):
+            recv, name = m.groups()
+            if recv not in self.data["md"]: self.data["md"][recv] = []
+            self.data["md"][recv].append({"n": name})
+
+        # Structs
+        for m in re.finditer(r'type\s+(\w+)\s+struct', c):
+            self.data["st"].append({"n": m.group(1)})
+
+        # Interfaces
+        for m in re.finditer(r'type\s+(\w+)\s+interface', c):
+            self.data["if"].append({"n": m.group(1)})
+
+        # Stats Heuristics
+        self.data["stat"]["gr"] = c.count("go func")
+        self.data["stat"]["ch"] = c.count("chan ") + c.count("<-")
+        self.data["stat"]["er"] = c.count("if err != nil")
+
+        # Comments (Doc sampling)
+        comments = re.findall(r'//\s*(.+)', c)
+        good_comments = [x.strip()[:80] for x in comments if len(x) > 10 and "TODO" not in x]
+        self.data["cm"] = good_comments[:3]
+
+# ---------------------------------------------------------
+# 项目遍历与生成
+# ---------------------------------------------------------
+
+def rglob_go_files(root, include_test=False):
+    files = []
+    for dp, dn, fns in os.walk(ensure_unicode(root)):
+        # 过滤常见无关目录
+        dn[:] = [d for d in dn if not d.startswith('.') and d not in ['vendor', 'node_modules', 'testdata']]
+        for fn in fns:
+            if fn.endswith('.go'):
+                if not include_test and fn.endswith('_test.go'): continue
+                files.append(os.path.join(dp, fn))
+    return files
+
+def generate_snapshot(repo_path, output_path, graph=False, include_test=False):
+    repo_path = os.path.abspath(repo_path)
+    
+    # 1. 准备 AST 解析器临时文件
+    temp_dir = tempfile.mkdtemp()
+    ast_parser_path = os.path.join(temp_dir, "parser.go")
+    has_go = False
+    try:
+        subprocess.check_call(["go", "version"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        with open(ast_parser_path, "w", encoding="utf-8") as f:
+            f.write(GO_AST_PARSER_SRC)
+        has_go = True
+        print("✅ Go environment detected. Using precise AST analysis.")
+    except:
+        print("⚠️ Go not found. Falling back to Regex analysis.")
+        ast_parser_path = None
+
+    # 2. 初始化摘要结构
+    digest = {
+        "root": os.path.basename(repo_path),
+        "pkgs": {}, # Key: package name
+        "graph": {}, # Dependency graph
+        "meta": {}
+    }
+    
+    pkg_map = defaultdict(list) # pkg_name -> list of file data
+    all_files = rglob_go_files(repo_path, include_test)
+    ignored = get_gitignored_files(repo_path)
+
+    print("Processing {} files...".format(len(all_files)))
+
+    # 3. 逐文件分析
+    total_cx = 0
+    total_err_checks = 0
+    
+    for fpath in all_files:
+        rel_path = os.path.relpath(fpath, repo_path)
+        if rel_path in ignored: continue
+        
+        extractor = HybridGoExtractor(fpath, ast_parser_path if has_go else None)
+        data = extractor.process()
+        
+        pkg_name = data.get("pk", "unknown")
+        
+        # 移除空字段以压缩 Token
+        clean_data = {k: v for k, v in data.items() if v}
+        clean_data["f"] = rel_path # 记录文件名
+        
+        # 统计聚合
+        if "stat" in data:
+            total_err_checks += data["stat"].get("er", 0)
+            # 复杂度累加
+            for fn in data.get("fn", []): total_cx += fn.get("cx", 1)
+        
+        pkg_map[pkg_name].append(clean_data)
+
+        # 构建依赖图 (仅保留非标准库)
+        deps = set()
+        for imp in data.get("im", []):
+            if "." in imp and not imp.startswith("github.com/user/repo"): # 简易过滤
+                deps.add(imp)
+        if deps:
+            if pkg_name not in digest["graph"]: digest["graph"][pkg_name] = []
+            digest["graph"][pkg_name].extend(list(deps))
+
+    # 4. 聚合包数据 (Package Level Aggregation)
+    # 为了进一步压缩，我们将同一个包下的文件内容合并，或者按文件列表列出
+    for pkg, files_data in pkg_map.items():
+        digest["pkgs"][pkg] = {
+            "files": len(files_data),
+            "cx_avg": 0, # 平均复杂度
+            "contents": files_data # 这里包含详细的 struct/func
+        }
+        
+        # 去重依赖图
+        if pkg in digest["graph"]:
+            digest["graph"][pkg] = sorted(list(set(digest["graph"][pkg])))
+
+    # 5. 生成元数据 Summary
+    digest["meta"] = {
+        "files": len(all_files),
+        "pkgs": len(pkg_map),
+        "total_complexity": total_cx,
+        "error_hotspots": total_err_checks,
+        "strategy": "AST" if has_go else "Regex"
+    }
+
+    # 清理临时文件
+    shutil.rmtree(temp_dir)
+
+    # 6. 写入 YAML
+    with open(output_path, "w", encoding="utf-8") as f:
+        yaml.dump(digest, f, default_flow_style=False, sort_keys=False, width=120, allow_unicode=True)
+    
+    print("✅ Snapshot generated: {} ({:.1f}KB)".format(output_path, os.path.getsize(output_path)/1024.0))
+
+    # 7. 图形生成 (可选)
+    if graph:
+        try:
+            import networkx as nx
+            import matplotlib.pyplot as plt
+            G = nx.DiGraph()
+            for src, dsts in digest["graph"].items():
+                for dst in dsts:
+                    # 简化节点名: github.com/a/b -> b
+                    short_dst = dst.split('/')[-1]
+                    G.add_edge(src, short_dst)
+            
+            plt.figure(figsize=(12, 12))
+            pos = nx.spring_layout(G, k=0.5)
+            nx.draw(G, pos, with_labels=True, node_size=1500, node_color="lightblue", font_size=8, arrowsize=15)
+            plt.savefig(output_path.replace(".yaml", ".png"))
+            print("✅ Graph saved.")
+        except ImportError:
+            print("⚠️  Skipping graph (networkx/matplotlib missing).")
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="Go Semantic Snapshot Generator v2 (AST+Regex)")
+    parser.add_argument("repo", help="Go project path")
+    parser.add_argument("-o", "--out", default="go_digest.yaml", help="Output file")
+    parser.add_argument("--graph", action="store_true", help="Generate dependency graph")
+    parser.add_argument("--no-test", action="store_true", help="Exclude _test.go")
+    
+    args = parser.parse_args()
+    generate_snapshot(args.repo, args.out, args.graph, not args.no_test)
+
diff --git a/semantic-snapshot/py-semantic-snapshot-v3.py b/semantic-snapshot/py-semantic-snapshot-v3.py
new file mode 100644
index 0000000..041e744
--- /dev/null
+++ b/semantic-snapshot/py-semantic-snapshot-v3.py
@@ -0,0 +1,795 @@
+# -*- coding: utf-8 -*-
+"""
+py-semantic-snapshot.py (V3.0)
+Python 项目语义快照生成器 - 实现极致 Token 压缩与无损理解
+
+主要功能更新 (V3.0):
+1. 数据增强：记录常量/变量/类属性的初始值（截断），支持Docstring参数/返回信息提取。
+2. 上下文追溯：记录内部 import 语句的行号。
+3. 摘要量化：模块级摘要 (sm) 增加平均 CCN 和高 CCN 函数计数。
+4. Token控制：新增 CLI 参数控制 Docstring 和参数列表的截断长度。
+5. 采样功能：对高 CCN 函数进行代码片段采样 (sample)。
+
+使用示例：
+    python3.9 py-semantic-snapshot.py ./project -o digest.yaml --max-doc-len 100 --ccn-threshold 10
+"""
+
+from __future__ import unicode_literals
+import os
+import ast
+import re
+import subprocess
+import sys
+import argparse
+from collections import defaultdict
+
+try:
+    import yaml  # 需要: pip install PyYAML
+    import networkx as nx # 需要: pip install networkx
+    import matplotlib
+    import matplotlib.pyplot as plt # 需要: pip install matplotlib
+    
+    # 兼容服务器 / 无显示环境
+    matplotlib.use("Agg")
+    CAN_GRAPH = True
+except ImportError:
+    print("⚠️ 缺少依赖: PyYAML, networkx, 或 matplotlib。图表功能将跳过。")
+    CAN_GRAPH = False
+
+
+# --- 配置与工具函数 ---
+if sys.version_info[0] == 2:
+    text_type = unicode
+    string_types = (str, unicode)
+else:
+    text_type = str
+    string_types = (str,)
+
+def ensure_unicode(s):
+    if isinstance(s, text_type):
+        return s
+    if isinstance(s, bytes):
+        return s.decode("utf-8", errors="ignore")
+    return text_type(s)
+
+def get_gitignored_files(repo_path):
+    try:
+        ignored = subprocess.check_output(
+            ["git", "ls-files", "--others", "-i", "--exclude-standard"],
+            cwd=repo_path,
+        ).decode("utf-8", errors="ignore").splitlines()
+        return set(ignored)
+    except Exception:
+        return set()
+    
+# --- 复杂度计算辅助函数 ---
+
+def calculate_complexity(node):
+    """
+    计算简化版的 McCabe 圈复杂度 (CCN)。
+    修订：CCN 从 1 开始，并确保能够处理函数体（一个 AST 节点列表）。
+    """
+    complexity = 1  # 默认复杂度为 1 (函数定义本身)
+    
+    # AST 节点或节点列表
+    nodes_to_walk = node if isinstance(node, (list, tuple)) else [node]
+    
+    def count_control_flow(n):
+        nonlocal complexity
+        
+        if isinstance(n, (ast.If, ast.For, ast.While, ast.AsyncFor, ast.With, ast.AsyncWith)):
+            complexity += 1
+        elif isinstance(n, ast.Try):
+            # 基础 Try (1) + 每个 Except/Else/Finally 块的附加路径 (这里只算每个 except 处理器)
+            complexity += len(n.handlers)
+        elif isinstance(n, ast.BoolOp):
+            if isinstance(n.op, (ast.And, ast.Or)):
+                complexity += len(n.values) - 1
+        elif isinstance(n, (ast.ListComp, ast.SetComp, ast.DictComp, ast.GeneratorExp)):
+            for generator in n.generators:
+                complexity += len(generator.ifs)
+
+    # 遍历所有节点，如果是列表，则遍历列表中的每个根节点
+    for root_node in nodes_to_walk:
+        if root_node:
+            # 正确使用 ast.walk: 遍历节点及其所有子节点
+            # 注意: 如果传入的是函数体列表，我们已经在外层 for 循环处理了列表本身
+            # 此时 root_node 就是列表中的一个语句节点
+            for n in ast.walk(root_node):
+                count_control_flow(n) # 对遍历到的每个节点调用计数函数
+    
+    # 原始代码在 _process_function_or_method 中传入的是 node.body (list)，
+    # 因此函数体列表中的节点都需要被遍历。
+    # 经过上述修改，如果传入的是列表，我们将遍历列表中的每个元素。
+    
+    return complexity
+
+# --- AST 语义提取器 (V3.0) ---
+
+IO_CALLS = {
+    "open": "File_IO", "read": "File_IO", "write": "File_IO",
+    "requests.get": "Network_HTTP", "requests.post": "Network_HTTP",
+    "socket.socket": "Network_Socket", 
+    "db.connect": "Database_Op", "cursor.execute": "Database_Op",
+    "subprocess.run": "IPC_Process", "os.popen": "IPC_Process",
+}
+
+class PythonSemanticExtractor(ast.NodeVisitor):
+    """使用 AST 提取 Python 代码语义结构 (V3.0, 短键名)"""
+
+    def __init__(self, args):
+        self.args = args  # 接收 CLI 参数
+        self.info = {
+            "im": [],              # imports (带行号)
+            "fim": {},             # from_imports: {module: [items]} (带行号)
+            "cl": [],              # classes
+            "fn": [],              # functions 
+            "md": {},              # methods
+            "ds": [],              # docstrings (sampled)
+            "cv": [],              # constants/vars (带值和类型)
+            "cl_attr": [],         # class attributes (新增)
+            "dc": [],              # decorators (模块级)
+            "th": [],              # type hints (模块级)
+            "stat": {
+                "async": 0,           
+                "th": 0,              
+                "io": defaultdict(int),   
+                "err": {"total": 0, "generic": 0},  
+            },
+        }
+        self.current_class = None
+        # 存储文件内容行，用于代码片段采样 (需求6)
+        self.content_lines = [] 
+        
+    def set_content_lines(self, lines):
+        self.content_lines = lines
+
+    # --- Import (新增行号追踪 - 需求2) ---
+
+    def visit_Import(self, node):
+        for alias in node.names:
+            # 记录 import 语句的行号
+            self.info["im"].append({"n": alias.name, "ln": node.lineno})
+        self.generic_visit(node)
+
+    def visit_ImportFrom(self, node):
+        module = node.module or ""
+        # 记录 import 语句的行号
+        items = [{"n": alias.name, "ln": node.lineno} for alias in node.names]
+        self.info["fim"].setdefault(module, []).extend(items)
+        self.generic_visit(node)
+
+    # --- Class (新增类属性提取 - 需求3a) ---
+
+    def visit_ClassDef(self, node):
+        class_info = {
+            "n": node.name,
+            "ln": node.lineno, 
+            "bs": [self._get_name(base) for base in node.bases], 
+            "dc": [self._get_name(dec) for dec in node.decorator_list], 
+            "attrs": [] # 类属性列表
+        }
+        
+        # Docstring 采样
+        docstring = ast.get_docstring(node)
+        if docstring and len(docstring) > 10:
+            doc_len = min(len(docstring), self.args.max_docstring_len)
+            self.info["ds"].append({"t": "cl", "n": node.name, "doc": docstring[:doc_len] + ("..." if doc_len < len(docstring) else "")})
+
+        # 遍历类体，提取类属性（Assignment 在方法定义前出现）
+        for item in node.body:
+            if isinstance(item, ast.Assign):
+                # Class attributes (V3.0)
+                value_repr = self._get_annotation(item.value, max_len=self.args.max_assign_len)
+                for target in item.targets:
+                    if isinstance(target, ast.Name):
+                        name = target.id
+                        is_constant = name.isupper() and (name.replace('_', '').isalnum())
+                        class_info["attrs"].append({
+                            "n": name, 
+                            "ln": target.lineno,
+                            "const": is_constant,
+                            "val": value_repr, 
+                        })
+            elif isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                # 遇到函数/方法后，停止查找类属性
+                break 
+
+        self.info["cl"].append(class_info)
+
+        # 进入类上下文
+        old_class = self.current_class
+        self.current_class = node.name
+        self.generic_visit(node)
+        self.current_class = old_class
+
+    # --- Function/Method (新增 Docstring 参数解析/Arg截断/代码采样 - 需求3b/4/6) ---
+
+    def _parse_docstring_params(self, docstring):
+        """简单启发式解析 Docstring 中的参数和返回值 (需求3b)"""
+        if not docstring: return None
+        
+        parsed = {}
+        # 匹配 Google/Sphinx 风格的参数和返回
+        param_matches = re.findall(r"^\s*(?:Args|Parameters|:param)\s*:\s*(\w+)\s*(?:(?:[ \t]|\([\w,]+\))?:?\s*(.*))?", docstring, re.MULTILINE | re.IGNORECASE)
+        return_matches = re.findall(r"^\s*(?:Returns|:returns:)\s*:\s*(.*)", docstring, re.MULTILINE | re.IGNORECASE)
+
+        if param_matches:
+            parsed["p"] = [f"{n}: {d.strip()}" for n, d in param_matches if n]
+        if return_matches:
+            parsed["r"] = [r.strip() for r in return_matches]
+        
+        return parsed if parsed else None
+
+
+    def _process_function_or_method(self, node):
+        # 传入 node.body 以计算 CCN (因为 CCN = 1 已经在 calculate_complexity 中计算)
+        ccn = calculate_complexity(node.body)
+        
+        # 参数截断 (需求4)
+        args_list = [arg.arg for arg in node.args.args]
+        if len(args_list) > self.args.max_args_len:
+            args_list = args_list[:self.args.max_args_len] + ["..."]
+
+        func_info = {
+            "n": node.name,
+            "ln": node.lineno,
+            "cx": ccn, 
+            "args": args_list,
+            "dc": [self._get_name(dec) for dec in node.decorator_list],
+            "ret": self._get_annotation(node.returns),
+            "async": isinstance(node, ast.AsyncFunctionDef),
+        }
+
+        if func_info["async"]: self.info["stat"]["async"] += 1
+        
+        # 代码片段采样 (需求6)
+        if ccn >= self.args.ccn_threshold and self.content_lines:
+            start_line = node.lineno
+            end_line = node.end_lineno if hasattr(node, 'end_lineno') else start_line + 5
+            
+            # 采样 N 行
+            sample_end = min(end_line, start_line + self.args.code_sample_lines)
+            # AST行号是1-based，列表是0-based。我们想要包含 node.lineno 行，所以从 node.lineno - 1 开始
+            # 但是为了获取函数体的行，我们从 start_line 开始（即定义行）
+            sample_lines = self.content_lines[start_line - 1 : sample_end]
+            
+            # 移除公共缩进
+            if sample_lines:
+                try:
+                    # 找到定义行的缩进（第一行）
+                    indent = len(sample_lines[0]) - len(sample_lines[0].lstrip())
+                    # 对后续行应用相同的缩进移除
+                    func_info["sample"] = [line[indent:].rstrip() for line in sample_lines]
+                except IndexError:
+                    pass # 无法获取缩进
+
+        # Docstring 采样与解析 (需求3b, 4)
+        docstring = ast.get_docstring(node)
+        if docstring and len(docstring) > 10:
+            params_from_doc = self._parse_docstring_params(docstring)
+            doc_len = min(len(docstring), self.args.max_docstring_len)
+
+            doc_entry = {
+                "t": "md" if self.current_class else "fn",
+                "n": node.name, 
+                "doc": docstring[:doc_len] + ("..." if doc_len < len(docstring) else "")
+            }
+            if params_from_doc:
+                doc_entry["doc_p"] = params_from_doc
+            self.info["ds"].append(doc_entry)
+
+        if self.current_class:
+            self.info["md"].setdefault(self.current_class, []).append(func_info)
+        else:
+            self.info["fn"].append(func_info)
+
+    def visit_FunctionDef(self, node):
+        self._process_function_or_method(node)
+        self.generic_visit(node)
+
+    def visit_AsyncFunctionDef(self, node):
+        self._process_function_or_method(node)
+        self.generic_visit(node)
+
+    # --- Assign (新增常量/变量信息 - 需求1) ---
+
+    def visit_Assign(self, node):
+        # 如果在 ClassDef 外 (模块级变量/常量)
+        if self.current_class is None:
+            value_repr = self._get_annotation(node.value, max_len=self.args.max_assign_len) 
+            
+            for target in node.targets:
+                if isinstance(target, ast.Name):
+                    name = target.id
+                    is_constant = name.isupper() and (name.replace('_', '').isalnum()) # 启发式判断
+                    
+                    self.info["cv"].append({
+                        "n": name, 
+                        "ln": target.lineno,
+                        "const": is_constant,
+                        "val": value_repr, 
+                    })
+        
+        # 注意: 类属性的提取已移至 visit_ClassDef 中，以避免在方法内赋值也被误判为类属性。
+        self.generic_visit(node)
+        
+    # --- 深度分析：I/O, IPC, Error Handling (V2.0 保留) ---
+    
+    def visit_Call(self, node):
+        call_name = self._get_name(node.func)
+        for keyword, category in IO_CALLS.items():
+            if keyword in call_name:
+                self.info["stat"]["io"][category] += 1
+        self.generic_visit(node)
+        
+    def visit_Try(self, node):
+        self.info["stat"]["err"]["total"] += 1
+        for handler in node.handlers:
+            if handler.type is None or self._get_name(handler.type) in ["Exception", "BaseException"]:
+                self.info["stat"]["err"]["generic"] += 1
+                if "generic_excepts" not in self.info["stat"]["err"]:
+                     self.info["stat"]["err"]["generic_excepts"] = []
+                self.info["stat"]["err"]["generic_excepts"].append(handler.lineno)
+        self.generic_visit(node)
+
+    # --- 辅助方法 (V3.0 优化 `_get_annotation` 的值捕获) ---
+
+    def _get_name(self, node):
+        if isinstance(node, ast.Name):
+            return node.id
+        elif isinstance(node, ast.Attribute):
+            base = self._get_name(node.value)
+            return "{}.{}".format(base, node.attr) if base else node.attr
+        elif isinstance(node, ast.Call):
+            return self._get_name(node.func)
+        return ""
+
+    def _get_annotation(self, node, max_len=50):
+        if node is None:
+            return None
+        
+        # 尝试使用 ast.unparse (Python 3.9+) 获取表达式的字符串表示
+        if hasattr(ast, "unparse"):
+            try:
+                representation = ast.unparse(node).strip()
+                if len(representation) > max_len:
+                    return representation[:max_len] + "..."
+                return representation
+            except Exception:
+                # Fallback to type names if unparse fails
+                pass 
+
+        if isinstance(node, ast.Name):
+            return node.id
+        if isinstance(node, ast.Constant):
+            # 捕获字面量类型和值
+            val = text_type(node.value)
+            type_name = type(node.value).__name__
+            # 限制 val 的长度以避免超长 token
+            if len(val) > max_len:
+                val = val[:max_len] + "..."
+            return f"{type_name}({val})"
+        if isinstance(node, ast.Subscript):
+            value_name = self._get_name(node.value)
+            return "{}[...]".format(value_name or "Subscript")
+        
+        return text_type(node)
+
+
+# --- 解析入口函数 (适配 V3.0) ---
+
+def extract_python_semantics(filepath, args):
+    """对单个 Python 文件做 AST 语义提取 (V3.0)"""
+    info = {
+        "im": [], "fim": {}, "cl": [], "fn": [], "md": {}, "ds": [], "cv": [], "dc": [], "th": [], "cl_attr": [],
+        "stat": {"async": 0, "th": 0, "io": {}, "err": {"total": 0, "generic": 0}},
+    }
+    content = ""
+    content_lines = []
+
+    try:
+        with open(filepath, "r", encoding="utf-8") as f:
+            content = f.read()
+        content = ensure_unicode(content)
+        # 用于代码采样，保留换行符
+        content_lines = content.splitlines(keepends=True) 
+        # AST 行号是 1-based，但 for 循环中获取 start_line: sample_end 已经调整，这里用 splitlines() 不保留换行，方便处理缩进
+        content_lines = content.splitlines() 
+    except Exception:
+        return info
+
+    try:
+        # Python 3.8+ 支持 end_lineno 和 end_col_offset
+        tree = ast.parse(content)
+        extractor = PythonSemanticExtractor(args)
+        extractor.set_content_lines(content_lines)
+        extractor.visit(tree)
+        info.update(extractor.info)
+        info["stat"]["io"] = dict(info["stat"]["io"])
+    except SyntaxError:
+        # AST 解析失败时，回退到正则
+        print(f"⚠️ 语法错误，文件 {filepath} 使用正则兜底。")
+        return extract_python_semantics_fallback(content)
+    except Exception as e:
+        print(f"❌ 严重错误：解析 {filepath} 失败：{e}。使用正则兜底。")
+        return extract_python_semantics_fallback(content)
+
+    return info
+
+
+def extract_python_semantics_fallback(content):
+    """AST 解析失败时的兜底：基于正则的粗粒度提取"""
+    # ... (V2.0 逻辑保持不变, 无法提供 V3.0 深度信息) ...
+    info = {
+        "im": [], "fim": {}, "cl": [], "fn": [], "md": {}, "ds": [], "cv": [], "dc": [], "th": [], "cl_attr": [],
+        "stat": {"async": 0, "th": 0, "io": {}, "err": {"total": 0, "generic": 0}},
+    }
+
+    # 正则提取: 仅提取名称和参数 (无 CCN/ln/val/sample)
+    # 这里我们尝试模拟行号，但只能是粗略的近似
+    content_lines = content.splitlines()
+    
+    # Imports
+    for i, line in enumerate(content_lines):
+        match_import = re.match(r"^\s*import\s+([\w\.]+)", line)
+        if match_import:
+            info["im"].append({"n": match_import.group(1), "ln": i + 1})
+        
+        match_from_import = re.match(r"^\s*from\s+([\w\.]+)\s+import\s+(.+)", line)
+        if match_from_import:
+            module, items_str = match_from_import.groups()
+            items_list = [{"n": item.strip(), "ln": i + 1} for item in items_str.split(",")]
+            info["fim"].setdefault(module, []).extend(items_list)
+
+    # Classes and Functions (需要更复杂的正则来获取准确的行号和参数)
+    for i, line in enumerate(content_lines):
+        match_class = re.match(r"^\s*class\s+(\w+)(?:\(([^)]*)\))?:", line)
+        if match_class:
+            name, bases = match_class.groups()
+            info["cl"].append({"n": name, "ln": i + 1, "bs": [b.strip() for b in bases.split(",")] if bases else [], "attrs": []})
+        
+        match_func = re.match(r"^\s*(async\s+)?def\s+(\w+)\s*\(([^)]*)\)", line)
+        if match_func:
+            is_async, name, args_str = match_func.groups()
+            args = [a.strip() for a in args_str.split(",")] if args_str else []
+            func_info = {"n": name, "ln": i + 1, "cx": 1, "args": args, "async": bool(is_async)}
+            info["fn"].append(func_info)
+            if is_async: info["stat"]["async"] += 1
+            
+    # 清理 fn/cl 列表中的重复项（正则可能在多行匹配中出错，虽然这里是单行匹配）
+    # 由于是兜底，我们接受其粗糙性。
+    
+    return info
+
+
+# --- 依赖图绘制 (保留 V2.0 逻辑) ---
+
+def generate_dependency_graph(import_graph, output_path_base):
+    # ... (V2.0 逻辑保持不变) ...
+    if not CAN_GRAPH:
+        return
+
+    G = nx.DiGraph()
+
+    for module, deps in import_graph.items():
+        if not deps:
+            G.add_node(module)
+        for dep in deps:
+            G.add_edge(module, dep)
+
+    if len(G.nodes) == 0:
+        print("ℹ️  依赖图为空，跳过生成。")
+        return
+
+    try:
+        k = 1.0 / max(len(G.nodes), 1) ** 0.5
+        pos = nx.spring_layout(G, k=k, iterations=80)
+    except Exception:
+        pos = nx.spring_layout(G)
+
+    base_size = max(8, min(20, len(G.nodes) * 0.4))
+    plt.figure(figsize=(base_size, base_size * 0.75))
+
+    nx.draw_networkx_nodes(G, pos, node_size=800)
+    nx.draw_networkx_edges(G, pos, arrows=True, arrowstyle="-|>", arrowsize=12)
+    nx.draw_networkx_labels(G, pos, font_size=8)
+
+    plt.axis("off")
+    png_path = output_path_base + ".png"
+    svg_path = output_path_base + ".svg"
+
+    try:
+        plt.tight_layout()
+        plt.savefig(png_path, dpi=150)
+        plt.savefig(svg_path, dpi=150)
+        plt.close()
+        print("✅ Dependency graph generated: {}, {}".format(png_path, svg_path))
+    except Exception as e:
+        print("⚠️  依赖图保存失败: {}".format(ensure_unicode(str(e))))
+        plt.close()
+
+
+# --- 项目遍历 & 摘要生成 (V3.0) ---
+
+CONFIG_FILES = [
+    "requirements.txt", "setup.py", "pyproject.toml",
+    "Pipfile", "poetry.lock", "tox.ini",
+]
+
+def rglob_py_files(root_path):
+    # ... (V2.0 逻辑保持不变) ...
+    py_files = []
+    root_path = ensure_unicode(root_path)
+    for dirpath, dirnames, filenames in os.walk(root_path):
+        dirpath = ensure_unicode(dirpath)
+        dirnames[:] = [
+            ensure_unicode(d) for d in dirnames
+            if not ensure_unicode(d).startswith(".") and d not in ["__pycache__", "venv", "env", ".venv", "node_modules", "build", "dist", ".tox"]
+        ]
+        for filename in filenames:
+            filename = ensure_unicode(filename)
+            if filename.endswith(".py") and not filename.startswith("."):
+                py_files.append(os.path.join(dirpath, filename))
+    return py_files
+
+
+def generate_semantic_digest(repo_path, output_path, args):
+    """生成 Python 项目的语义摘要（V3.0）"""
+    repo_path = ensure_unicode(os.path.abspath(repo_path))
+    ignored = get_gitignored_files(repo_path)
+
+    digest = {
+        "root": repo_path,
+        "type": "python",
+        "files": [],                       
+        "modules": defaultdict(
+            lambda: {
+                "f": [],                    
+                "im": [],                   # list of dicts (name, ln)
+                "fim": defaultdict(list),   # dict of lists (name, ln)
+                "cl": [],                   
+                "fn": [],                   
+                "md": {},                   
+                "dc": set(),                
+                "ds": [],                   
+                "cv": [],                   # module vars/consts
+                "cl_attr": [],              # class attributes
+                "stat": {},                 
+            }
+        ),
+        "deps": defaultdict(set),           
+        "sum": {},                          
+    }
+    
+    found_config_files = [f for f in CONFIG_FILES if os.path.exists(os.path.join(repo_path, f))]
+    all_imports = set()
+    total_ccn = 0
+    
+    for path in rglob_py_files(repo_path):
+        path = ensure_unicode(path)
+        rel_path = ensure_unicode(os.path.relpath(path, repo_path))
+
+        if any(skip in rel_path for skip in ["test_", "_test.py", "/tests/", "/venv/", "/env/", "/.venv/", "/build/", "/dist/", "/__pycache__/", "/site-packages/"]):
+            continue
+        if rel_path in ignored:
+            continue
+
+        semantics = extract_python_semantics(path, args) # 传入 args
+
+        module_parts = rel_path.replace(".py", "").replace(os.sep, ".").split(".")
+        if module_parts and module_parts[-1] == "__init__":
+            module_parts = module_parts[:-1]
+        module_name = ".".join(module_parts) if module_parts else "root"
+
+        mod_entry = digest["modules"][module_name]
+        mod_entry["f"].append(rel_path)
+        digest["files"].append(rel_path)
+
+        # 聚合结构
+        mod_entry["im"].extend(semantics.get("im", []))
+        mod_entry["cv"].extend(semantics.get("cv", []))
+        mod_entry["cl_attr"].extend(semantics.get("cl_attr", []))
+        
+        for imp in semantics.get("im", []):
+            all_imports.add(imp["n"])
+        
+        for module, items in semantics.get("fim", {}).items():
+            mod_entry["fim"][module].extend(items)
+            all_imports.add(module)
+        
+        mod_entry["cl"].extend(semantics.get("cl", []))
+        mod_entry["fn"].extend(semantics.get("fn", []))
+        for class_name, methods in semantics.get("md", {}).items():
+            mod_entry["md"].setdefault(class_name, []).extend(methods)
+            
+        mod_entry["dc"].update(semantics.get("dc", []))
+        mod_entry["ds"].extend(semantics.get("ds", []))
+        
+        mod_entry["stat"].update(semantics.get("stat", {}))
+        
+        # 累加 CCN
+        total_ccn += sum(f.get("cx", 1) for f in semantics.get("fn", []))
+        total_ccn += sum(m.get("cx", 1) for methods in semantics.get("md", {}).values() for m in methods)
+
+        # 构建依赖图 (使用导入名，不使用行号)
+        for imp in semantics.get("im", []):
+            # 过滤掉常见的标准库（它们通常不构成项目内部依赖图）
+            if not imp["n"].startswith(("sys", "os", "re", "json", "time", "datetime", "logging", "collections", "io", "abc", "math", "random", "unittest", "zipfile")):
+                digest["deps"][module_name].add(imp["n"])
+        for module, _items in semantics.get("fim", {}).items():
+            if module and not module.startswith(("sys", "os", "re", "json", "time", "datetime", "logging", "collections", "io", "abc", "math", "random", "unittest", "zipfile")):
+                digest["deps"][module_name].add(module)
+
+
+    # --- 收尾：清理、聚合、生成项目总结 (V3.0) ---
+
+    total_functions = 0
+    total_mod_ccn = 0
+    
+    # 1. 模块级清理和总结
+    for module_name, data in digest["modules"].items():
+        data["im"] = sorted(data["im"], key=lambda x: x["n"])
+        data["dc"] = sorted(list(data["dc"]))
+
+        # 精简 docstrings
+        unique_docs = []
+        seen = set()
+        for doc in data["ds"]:
+            key = (doc.get("t"), doc.get("n"))
+            if key not in seen:
+                seen.add(key)
+                unique_docs.append(doc)
+        data["ds"] = unique_docs[:5]
+
+        # CCN Metrics (用于 sm 摘要 - 需求5)
+        all_cx = [f.get("cx", 1) for f in data["fn"]]
+        all_cx.extend(m.get("cx", 1) for methods in data["md"].values() for m in methods)
+        
+        count_functions = len(data["fn"]) + sum(len(m) for m in data["md"].values())
+        total_functions += count_functions
+        
+        total_module_ccn = sum(all_cx)
+        total_mod_ccn += total_module_ccn
+        
+        avg_ccn = round(total_module_ccn / max(count_functions, 1), 1)
+        high_ccn_count = sum(1 for cx in all_cx if cx >= args.ccn_threshold)
+
+        # 模块级简短 summary (sm) - V3.0
+        summary_parts = []
+        if data["cl"]: summary_parts.append("CLS:{}".format(len(data["cl"])))
+        if count_functions > 0: 
+            summary_parts.append("FN:{}".format(count_functions))
+            summary_parts.append("AVG_CX:{}".format(avg_ccn)) # 需求5
+            if high_ccn_count > 0:
+                 summary_parts.append("HIGH_CX:{}".format(high_ccn_count)) # 需求5
+        if data["stat"].get("async"): summary_parts.append("ASYNC")
+        if data["stat"].get("th"): summary_parts.append("TH")
+        if any(v > 0 for v in data["stat"].get("io", {}).values()): summary_parts.append("IO/IPC")
+        if data["stat"].get("err", {}).get("generic") > 0: summary_parts.append("GenericErr:{}".format(data["stat"]["err"]["generic"]))
+
+        data["sm"] = "；".join(summary_parts) if summary_parts else "Python Module"
+
+    digest["modules"] = dict(digest["modules"])
+
+    # 2. 依赖图 dict 化
+    dep_graph_dict = {}
+    for mod, deps in digest["deps"].items():
+        dep_graph_dict[mod] = sorted(list(deps))
+    digest["deps"] = dep_graph_dict
+
+    # 3. 项目级 summary (sum)
+    total_modules = len(digest["modules"])
+    total_classes = sum(len(m["cl"]) for m in digest["modules"].values())
+
+    std_libs = {"os", "sys", "re", "json", "time", "logging", "datetime", "abc", "collections", "io", "math", "random", "unittest", "zipfile"}
+    project_pkgs = set(digest["modules"].keys())
+    # 筛选出非标准库且非项目内部模块的导入，作为技术栈
+    tech_stack = sorted(list((all_imports - std_libs) - project_pkgs))
+
+    digest["sum"] = {
+        "mod_count": total_modules,
+        "cl_count": total_classes,
+        "fn_count": total_functions,
+        "file_count": len(digest["files"]),
+        "total_ccn": total_mod_ccn,
+        "config_files": found_config_files, 
+        "tech_stack": tech_stack[:10], 
+        "has_async": any(m.get("stat", {}).get("async") > 0 for m in digest["modules"].values()),
+        # 注意: 无法准确统计 type hints，保持原字段但依赖 AST 结构 (这里简单保持 V2.0 逻辑)
+        "uses_type_hints": False, 
+    }
+
+    output_path = ensure_unicode(output_path)
+    try:
+        with open(output_path, "w", encoding="utf-8") as f:
+            yaml_content = yaml.dump(
+                digest,
+                allow_unicode=True,
+                default_flow_style=False,
+                sort_keys=False,
+                width=120,
+            )
+            f.write(yaml_content)
+
+        print("✅ Semantic project digest generated: {}".format(output_path))
+        print(
+            "   📊 Stats (V3.0): {} modules, {} classes, {} functions, Total CCN: {}".format(
+                total_modules, total_classes, total_functions, total_mod_ccn
+            )
+        )
+
+        try:
+            graph_base = output_path.replace(".yaml", "_dependency_graph")
+            generate_dependency_graph(digest["deps"], graph_base)
+        except Exception as e:
+            print("⚠️  生成依赖图时出错: {}".format(ensure_unicode(str(e))))
+            
+    except Exception as e:
+        print("❌ 写入输出文件时发生错误: {}".format(ensure_unicode(str(e))))
+
+
+# --- CLI 入口 (V3.0 新增参数) ---
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description=(
+            "生成 Python 项目的语义摘要（YAML）和依赖关系图（PNG/SVG），"
+            "用于大模型理解项目结构 & 大幅减少 token (V3.0)."
+        ),
+        epilog=(
+            "示例:\n"
+            "  python3.9 py-semantic-snapshot.py ./project -o digest.yaml --max-doc-len 100 --ccn-threshold 10"
+        ),
+    )
+    parser.add_argument("repo_path", help="本地 Python 项目路径")
+    parser.add_argument(
+        "-o",
+        "--output",
+        default="python_semantic_digest.yaml",
+        help="输出 YAML 文件路径",
+    )
+    # V3.0 新增 Token 控制和采样参数 (需求4, 6)
+    parser.add_argument(
+        "--max-doc-len",
+        type=int,
+        default=200,
+        dest="max_docstring_len",
+        help="Docstring 采样最大长度 (Token 优化).",
+    )
+    parser.add_argument(
+        "--max-args-len",
+        type=int,
+        default=5,
+        dest="max_args_len",
+        help="函数参数列表最大数量，超出则截断 (Token 优化).",
+    )
+    parser.add_argument(
+        "--max-assign-len",
+        type=int,
+        default=30,
+        dest="max_assign_len",
+        help="常量/变量/属性初始值表达式的最大长度 (Token 优化).",
+    )
+    parser.add_argument(
+        "--ccn-threshold",
+        type=int,
+        default=10,
+        dest="ccn_threshold",
+        help="CCN 阈值。高于此值的函数，其代码片段将被采样 (需求6) 并在摘要中标记 (需求5).",
+    )
+    parser.add_argument(
+        "--code-sample-lines",
+        type=int,
+        default=5,
+        dest="code_sample_lines",
+        help="高 CCN 函数的代码采样行数 (需求6).",
+    )
+
+    args = parser.parse_args()
+
+    if not os.path.exists(args.repo_path):
+        print("❌ Error: Path '{}' does not exist".format(args.repo_path))
+        sys.exit(1)
+
+    generate_semantic_digest(args.repo_path, args.output, args)

From 2d80aed3642e821443e774857e646023175856a4 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Sun, 30 Nov 2025 13:28:54 +0000
Subject: [PATCH 2/2] feat: align snapshot create with semantic scripts

Refactors the `codesage snapshot create` command to produce output
identical to the standalone `py-semantic-snapshot-v3.py` and
`go-semantic-snapshot-v4.py` scripts, while preserving the existing
`--language` and `--format` options.

- Implements `python-semantic-digest` and `go-semantic-digest` formats
  to generate compatible semantic snapshots for Python and Go projects.
- The Go snapshot builder now compiles and runs a dedicated Go AST
  parser for accurate analysis.
- The `--language` flag is preserved to select the language parser,
  while the `--format` flag selects the output type.
- The default JSON snapshot logic for other languages (e.g., Shell,
  Java) is preserved.
- Restores and adds tests for `snapshot show` and `snapshot cleanup`
  subcommands to prevent regressions.
- Adds a `--keep` option to the `snapshot cleanup` command to allow
  overriding the number of snapshots to retain.
---
 codesage/cli/commands/snapshot.py             |  72 ++-
 .../semantic_digest/go_snapshot_builder.py    | 476 +++++++++++++-----
 .../python_snapshot_builder.py                | 392 ++++++++++-----
 codesage/snapshot/versioning.py               |  66 ++-
 codesage/snapshot/yaml_generator.py           |  13 +-
 examples/go_test/main.go                      |  24 +
 examples/python_test/main.py                  |  13 +
 go_test_codesage.yaml                         |   8 +
 go_test_script.yaml                           |   8 +
 python_test_codesage.yaml                     |  70 +++
 python_test_script.yaml                       |  71 +++
 semantic-snapshot/py-semantic-snapshot-v3.py  |  18 +-
 tests/cli/test_snapshot.py                    | 241 +++++++++
 tests/cli/test_snapshot_command.py            |  34 --
 14 files changed, 1133 insertions(+), 373 deletions(-)
 create mode 100644 examples/go_test/main.go
 create mode 100644 examples/python_test/main.py
 create mode 100644 go_test_codesage.yaml
 create mode 100644 go_test_script.yaml
 create mode 100644 python_test_codesage.yaml
 create mode 100644 python_test_script.yaml
 create mode 100644 tests/cli/test_snapshot.py
 delete mode 100644 tests/cli/test_snapshot_command.py

diff --git a/codesage/cli/commands/snapshot.py b/codesage/cli/commands/snapshot.py
index 1193367..27e3ed6 100644
--- a/codesage/cli/commands/snapshot.py
+++ b/codesage/cli/commands/snapshot.py
@@ -98,7 +98,7 @@ def _create_snapshot_data(path, project_name):
 @snapshot.command('create')
 @click.argument('path', type=click.Path(exists=True, dir_okay=True))
 @click.option('--project', '-p', 'project_name_override', help='Override the project name.')
-@click.option('--format', '-f', type=click.Choice(['json', 'python-semantic-digest']), default='json', help='Snapshot format.')
+@click.option('--format', '-f', type=click.Choice(['json', 'python-semantic-digest', 'go-semantic-digest']), default='json', help='Snapshot format.')
 @click.option('--output', '-o', type=click.Path(), default=None, help='Output file path.')
 @click.option('--compress', is_flag=True, help='Enable compression.')
 @click.option('--language', '-l', type=click.Choice(['python', 'go', 'shell', 'java', 'auto']), default='auto', help='Language to analyze.')
@@ -110,7 +110,7 @@ def create(ctx, path, project_name_override, format, output, compress, language)
     try:
         root_path = Path(path)
 
-        if format == 'python-semantic-digest':
+        if format in ['python-semantic-digest', 'go-semantic-digest']:
             if output is None:
                 output = f"{root_path.name}_{language}_semantic_digest.yaml"
 
@@ -118,47 +118,36 @@ def create(ctx, path, project_name_override, format, output, compress, language)
             builder = None
 
             if language == 'auto':
-                # We cannot easily auto-detect here without merging multiple snapshots logic which is in scan.py
-                # For now, we will fail or fallback to scanning all supported languages and picking one or errors.
-                # However, reusing logic from scan.py might be better.
-                # But to keep it simple and since scan.py does the heavy lifting for scanning,
-                # we might just recommend using scan command for multi-language.
-                # But the task requires snapshot command update too.
-
-                # Let's implement basic single-builder detection or multi-builder if possible.
-                # Reusing logic from scan.py is hard because scan.py logic is not exported nicely.
-                # Let's just check extensions and pick the first found or error if multiple?
-                # Or assume user passes specific language if they want specific digest.
-                # But let's try to support 'auto' by picking the most prominent language or just python if ambiguous.
-
-                # Better approach: Import detection logic from scan.py if I move it to a utility.
-                # I defined detect_languages in scan.py, I should have put it in utils.
-
-                # For now, I'll support 'java' explicitly and handle 'auto' minimally.
-                click.echo("Auto detection for snapshot create is partial. Please specify language for best results.")
-                # Simple check
-                if list(root_path.rglob("*.java")):
-                    language = "java"
-                elif list(root_path.rglob("*.py")):
-                    language = "python"
-                elif list(root_path.rglob("*.go")):
-                    language = "go"
-                elif list(root_path.rglob("*.sh")):
-                    language = "shell"
+                if format == 'python-semantic-digest':
+                    language = 'python'
+                elif format == 'go-semantic-digest':
+                    language = 'go'
                 else:
-                    click.echo("Could not auto-detect language.", err=True)
-                    return
-
-            if language == 'python':
+                    # Fallback for auto-detection if format doesn't imply language
+                    if list(root_path.rglob("*.py")):
+                        language = "python"
+                    elif list(root_path.rglob("*.go")):
+                        language = "go"
+                    elif list(root_path.rglob("*.java")):
+                        language = "java"
+                    elif list(root_path.rglob("*.sh")):
+                        language = "shell"
+                    else:
+                        click.echo("Could not auto-detect language.", err=True)
+                        return
+
+            if language == 'python' and format == 'python-semantic-digest':
                 builder = PythonSemanticSnapshotBuilder(root_path, config)
-            elif language == 'go':
+            elif language == 'go' and format == 'go-semantic-digest':
                 builder = GoSemanticSnapshotBuilder(root_path, config)
+            # Preserve other language builders for future use, but they won't be triggered
+            # by the current format options.
             elif language == 'shell':
                 builder = ShellSemanticSnapshotBuilder(root_path, config)
             elif language == 'java':
                 builder = JavaSemanticSnapshotBuilder(root_path, config)
             else:
-                click.echo(f"Unsupported language: {language}", err=True)
+                click.echo(f"Unsupported language/format combination: {language}/{format}", err=True)
                 return
 
             project_snapshot = builder.build()
@@ -240,10 +229,15 @@ def show(version, project):
 
 @snapshot.command('cleanup')
 @click.option('--project', '-p', required=True, help='The name of the project.')
+@click.option('--keep', type=int, default=None, help='Number of recent snapshots to keep.')
 @click.option('--dry-run', is_flag=True, help='Show which snapshots would be deleted.')
-def cleanup(project, dry_run):
+def cleanup(project, keep, dry_run):
     """Clean up old snapshots for a project."""
-    manager = SnapshotVersionManager(SNAPSHOT_DIR, project, DEFAULT_SNAPSHOT_CONFIG['snapshot'])
+    config_override = DEFAULT_SNAPSHOT_CONFIG['snapshot'].copy()
+    if keep is not None:
+        config_override['versioning']['max_versions'] = keep
+
+    manager = SnapshotVersionManager(SNAPSHOT_DIR, project, config_override)
 
     if dry_run:
         index = manager._load_index()
@@ -262,5 +256,5 @@ def cleanup(project, dry_run):
         for s in expired_snapshots:
             click.echo(f"- {s['version']}")
     else:
-        manager.cleanup_expired_snapshots()
-        click.echo(f"Expired snapshots for project '{project}' have been cleaned up.")
+        deleted_count = manager.cleanup_expired_snapshots()
+        click.echo(f"Cleaned up {deleted_count} expired snapshots for project '{project}'.")
diff --git a/codesage/semantic_digest/go_snapshot_builder.py b/codesage/semantic_digest/go_snapshot_builder.py
index d1143f0..c86b71c 100644
--- a/codesage/semantic_digest/go_snapshot_builder.py
+++ b/codesage/semantic_digest/go_snapshot_builder.py
@@ -1,146 +1,360 @@
 from __future__ import annotations
 import re
-from datetime import datetime, timezone
+import os
+import json
+import tempfile
+import shutil
+import subprocess
+from collections import defaultdict
 from pathlib import Path
-from typing import List
+from typing import List, Dict, Any
 
 from codesage.semantic_digest.base_builder import BaseLanguageSnapshotBuilder, SnapshotConfig
-from codesage.snapshot.models import (
-    ProjectSnapshot,
-    FileSnapshot,
-    FileMetrics,
-    SnapshotMetadata,
-    DependencyGraph,
+from codesage.snapshot.models import FileSnapshot
+
+GO_AST_PARSER_SRC = r"""
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"go/ast"
+	"go/parser"
+	"go/token"
+	"os"
+	"strings"
 )
-from codesage.analyzers.go_parser import GoParser
+
+// 压缩键名结构定义
+type Node struct {
+	Name       string      `json:"n,omitempty"`  // Name
+	Type       string      `json:"t,omitempty"`  // Type / Signature
+	Params     string      `json:"p,omitempty"`  // Params
+	Returns    string      `json:"r,omitempty"`  // Returns
+	Receiver   string      `json:"rc,omitempty"` // Receiver
+	Fields     []*Node     `json:"fd,omitempty"` // Fields
+	Methods    []*Node     `json:"md,omitempty"` // Interface Methods
+	Complexity int         `json:"cx,omitempty"` // Cyclomatic Complexity
+	IsGeneric  bool        `json:"gn,omitempty"` // Uses Generics
+}
+
+type FileSummary struct {
+	Package   string             `json:"pk"`
+	Imports   []string           `json:"im,omitempty"`
+	Structs   []*Node            `json:"st,omitempty"`
+	Ifaces    []*Node            `json:"if,omitempty"`
+	Funcs     []*Node            `json:"fn,omitempty"`
+	Methods   map[string][]*Node `json:"md,omitempty"` // Key: Receiver Type
+	Vars      []string           `json:"vr,omitempty"`
+	Consts    []string           `json:"cn,omitempty"`
+	Comments  []string           `json:"cm,omitempty"` // Sampled comments
+	Stats     FileStats          `json:"stat"`
+}
+
+type FileStats struct {
+	Goroutines int `json:"gr,omitempty"` // count of 'go func'
+	Channels   int `json:"ch,omitempty"` // count of channel ops
+	Errors     int `json:"er,omitempty"` // count of 'if err != nil'
+}
+
+func main() {
+	if len(os.Args) < 2 {
+		os.Exit(1)
+	}
+	fset := token.NewFileSet()
+	node, err := parser.ParseFile(fset, os.Args[1], nil, parser.ParseComments)
+	if err != nil {
+		os.Exit(1)
+	}
+
+	summary := FileSummary{
+		Package: node.Name.Name,
+		Methods: make(map[string][]*Node),
+	}
+
+	// 提取 Imports
+	for _, imp := range node.Imports {
+		path := strings.Trim(imp.Path.Value, "\"")
+		summary.Imports = append(summary.Imports, path)
+	}
+
+	// 访问 AST
+	ast.Inspect(node, func(n ast.Node) bool {
+		switch x := n.(type) {
+
+		// 统计并发与错误处理
+		case *ast.GoStmt:
+			summary.Stats.Goroutines++
+		case *ast.SendStmt:
+			summary.Stats.Channels++
+		case *ast.IfStmt:
+			// 简单的 heuristic 检测 if err != nil
+			if binExpr, ok := x.Cond.(*ast.BinaryExpr); ok {
+				if x, ok := binExpr.X.(*ast.Ident); ok && x.Name == "err" {
+					summary.Stats.Errors++
+				}
+			}
+
+		case *ast.FuncDecl:
+			fnNode := &Node{
+				Name:       x.Name.Name,
+				Complexity: calcComplexity(x.Body),
+				IsGeneric:  x.Type.TypeParams != nil,
+			}
+			// 参数与返回值签名
+			fnNode.Params, fnNode.Returns = extractSig(x.Type)
+
+			if x.Recv == nil {
+				summary.Funcs = append(summary.Funcs, fnNode)
+			} else {
+				// 方法
+				recvType := formatType(x.Recv.List[0].Type)
+				// 清理指针符号以便分组
+				rawRecv := strings.TrimLeft(recvType, "*")
+				fnNode.Receiver = recvType
+				summary.Methods[rawRecv] = append(summary.Methods[rawRecv], fnNode)
+			}
+
+		case *ast.GenDecl:
+			if x.Tok == token.TYPE {
+				for _, spec := range x.Specs {
+					typeSpec := spec.(*ast.TypeSpec)
+					tNode := &Node{Name: typeSpec.Name.Name}
+
+					// 泛型检测
+					if typeSpec.TypeParams != nil {
+						tNode.IsGeneric = true
+					}
+
+					switch t := typeSpec.Type.(type) {
+					case *ast.StructType:
+						// 提取 Struct 字段 (限制前10个)
+						count := 0
+						for _, field := range t.Fields.List {
+							if count > 10 { break }
+							typeStr := formatType(field.Type)
+							if len(field.Names) == 0 {
+								// 嵌入字段
+								tNode.Fields = append(tNode.Fields, &Node{Type: typeStr})
+							} else {
+								for _, name := range field.Names {
+									tNode.Fields = append(tNode.Fields, &Node{Name: name.Name, Type: typeStr})
+								}
+							}
+							count++
+						}
+						summary.Structs = append(summary.Structs, tNode)
+
+					case *ast.InterfaceType:
+						// 提取 Interface 方法
+						for _, method := range t.Methods.List {
+							if len(method.Names) > 0 {
+								p, r := extractSig(method.Type.(*ast.FuncType))
+								tNode.Methods = append(tNode.Methods, &Node{
+									Name: method.Names[0].Name,
+									Params: p,
+									Returns: r,
+								})
+							}
+						}
+						summary.Ifaces = append(summary.Ifaces, tNode)
+					}
+				}
+			} else if x.Tok == token.VAR {
+				for _, spec := range x.Specs {
+					vSpec := spec.(*ast.ValueSpec)
+					for _, name := range vSpec.Names {
+						summary.Vars = append(summary.Vars, name.Name)
+					}
+				}
+			} else if x.Tok == token.CONST {
+				for _, spec := range x.Specs {
+					cSpec := spec.(*ast.ValueSpec)
+					for _, name := range cSpec.Names {
+						summary.Consts = append(summary.Consts, name.Name)
+					}
+				}
+			}
+		}
+		return true
+	})
+
+	// 通道类型检测补充
+	ast.Inspect(node, func(n ast.Node) bool {
+		if t, ok := n.(*ast.ChanType); ok {
+			_ = t
+			summary.Stats.Channels++ // Count definition of channels too
+		}
+		return true
+	})
+
+	// 注释采样 (取 doc)
+	if len(node.Comments) > 0 {
+		for i, cg := range node.Comments {
+			if i >= 5 { break } // limit
+			txt := strings.TrimSpace(cg.Text())
+			if len(txt) > 5 && !strings.HasPrefix(txt, "TODO") {
+				if len(txt) > 100 { txt = txt[:100] + "..." }
+				summary.Comments = append(summary.Comments, txt)
+			}
+		}
+	}
+
+	b, _ := json.Marshal(summary)
+	fmt.Println(string(b))
+}
+
+// 简单的复杂度计算 (McCabe 简化版)
+func calcComplexity(body *ast.BlockStmt) int {
+	count := 1
+	if body == nil { return count }
+	ast.Inspect(body, func(n ast.Node) bool {
+		switch n.(type) {
+		case *ast.IfStmt, *ast.ForStmt, *ast.RangeStmt, *ast.CaseClause:
+			count++
+		case *ast.BinaryExpr:
+			// 统计 && 和 ||
+			be := n.(*ast.BinaryExpr)
+			if be.Op == token.LAND || be.Op == token.LOR {
+				count++
+			}
+		}
+		return true
+	})
+	return count
+}
+
+// 提取函数签名
+func extractSig(t *ast.FuncType) (params, returns string) {
+	ps := []string{}
+	if t.Params != nil {
+		for _, f := range t.Params.List {
+			typeStr := formatType(f.Type)
+			if len(f.Names) == 0 {
+				ps = append(ps, typeStr)
+			} else {
+				for _, name := range f.Names {
+					ps = append(ps, name.Name+" "+typeStr)
+				}
+			}
+		}
+	}
+	rs := []string{}
+	if t.Results != nil {
+		for _, f := range t.Results.List {
+			rs = append(rs, formatType(f.Type))
+		}
+	}
+	return strings.Join(ps, ","), strings.Join(rs, ",")
+}
+
+// 极简类型格式化
+func formatType(expr ast.Expr) string {
+	switch t := expr.(type) {
+	case *ast.Ident: return t.Name
+	case *ast.StarExpr: return "*" + formatType(t.X)
+	case *ast.SelectorExpr: return formatType(t.X) + "." + t.Sel.Name
+	case *ast.ArrayType: return "[]" + formatType(t.Elt)
+	case *ast.MapType: return "map[" + formatType(t.Key) + "]" + formatType(t.Value)
+	case *ast.InterfaceType: return "interface{}"
+	case *ast.ChanType: return "chan " + formatType(t.Value)
+	default: return "T"
+	}
+}
+"""
 
 class GoSemanticSnapshotBuilder(BaseLanguageSnapshotBuilder):
-    def build(self) -> ProjectSnapshot:
-        files = self._collect_files()
-        file_snapshots: List[FileSnapshot] = [self._build_file_snapshot(path) for path in files]
-
-        dep_graph = DependencyGraph()
-        # Aggregate dependencies from file snapshots
-        internal_pkgs = set()
-        # Assume internal packages start with project name or are relative
-        # Or simpler: internal dependencies are those that match other files' package?
-        # Go imports are full paths.
-
-        # We can map file path to package name if parser extracted package name.
-        # GoParser currently doesn't expose package name directly, let's fix that or infer.
-        # Actually GoParser should extract package name.
-
-        for fs in file_snapshots:
-            if fs.symbols and "imports" in fs.symbols:
-                 for imp in fs.symbols["imports"]:
-                     if "." in imp and not imp.startswith("std/"): # Heuristic
-                         dep_graph.external.append(imp)
-
-        # Deduplicate
-        dep_graph.external = sorted(list(set(dep_graph.external)))
-
-        metadata = SnapshotMetadata(
-            version="1.1",
-            timestamp=datetime.now(timezone.utc),
-            project_name=self.root_path.name,
-            file_count=len(file_snapshots),
-            total_size=sum(p.stat().st_size for p in files),
-            tool_version="0.2.0",
-            config_hash="dummy_hash_v2",
-        )
-
-        return ProjectSnapshot(
-            metadata=metadata,
-            files=file_snapshots,
-            dependencies=dep_graph,
-            languages=["go"],
-            language_stats={"go": {"files": len(file_snapshots)}},
-        )
+    def build(self) -> Dict[str, Any]:
+        has_go = False
+        try:
+            subprocess.check_call(["go", "version"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+            has_go = True
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            pass
 
-    def _collect_files(self) -> List[Path]:
-        return list(self.root_path.rglob("*.go"))
+        digest = {
+            "root": self.root_path.name, "pkgs": {}, "graph": {}, "meta": {}
+        }
 
-    def _build_file_snapshot(self, file_path: Path) -> FileSnapshot:
-        source_code = file_path.read_text()
-
-        parser = GoParser()
-        parser.parse(source_code)
-
-        functions = parser.extract_functions()
-        structs = parser.extract_structs()
-        interfaces = parser.extract_interfaces()
-        imports = parser.extract_imports()
-        stats = parser.get_stats()
-
-        funcs_data = []
-        for f in functions:
-            d = {
-                "name": f.name,
-                "params": f.params,
-                "return_type": f.return_type,
-                "complexity": f.complexity,
-                "start_line": f.start_line,
-                "end_line": f.end_line
-            }
-            if f.receiver:
-                d["receiver"] = f.receiver
-            if f.decorators:
-                d["decorators"] = f.decorators
-            funcs_data.append(d)
-
-        structs_data = []
-        for s in structs:
-            fields = []
-            for f in s.fields:
-                fields.append({
-                    "name": f.name,
-                    "type": f.type_name,
-                    "kind": f.kind
-                })
-            structs_data.append({
-                "name": s.name,
-                "fields": fields
-            })
-
-        interfaces_data = []
-        for i in interfaces:
-            methods = []
-            for m in i.methods:
-                methods.append({
-                    "name": m.name,
-                    "params": m.params,
-                    "return_type": m.return_type
-                })
-            interfaces_data.append({
-                "name": i.name,
-                "methods": methods
-            })
-
-        imports_data = [i.path for i in imports]
-
-        metrics = FileMetrics(
-            lines_of_code=len(source_code.splitlines()),
-            num_functions=len(functions),
-            num_types=len(structs) + len(interfaces),
-            language_specific={
-                "go": {
-                    "goroutines": stats["goroutines"],
-                    "channels": stats["channels"],
-                    "error_checks": stats["errors"]
+        pkg_map = defaultdict(list)
+        all_files = self._collect_files()
+        total_cx = 0
+        total_err_checks = 0
+
+        for fpath in all_files:
+            data = self._extract_semantics(fpath, has_go)
+            pkg_name = data.get("pk", "unknown")
+            clean_data = {k: v for k, v in data.items() if v}
+            clean_data["f"] = str(fpath.relative_to(self.root_path))
+
+            if "stat" in data:
+                total_err_checks += data["stat"].get("er", 0)
+                clean_data["stat"] = {
+                    "gr": data["stat"].get("gr", 0),
+                    "ch": data["stat"].get("ch", 0),
+                    "er": data["stat"].get("er", 0),
                 }
+
+            if "fn" in data:
+                total_cx += sum(fn.get("cx", 1) for fn in data["fn"])
+
+            pkg_map[pkg_name].append(clean_data)
+
+            deps = {imp for imp in data.get("im", []) if "." in imp}
+            if deps:
+                digest["graph"].setdefault(pkg_name, []).extend(list(deps))
+
+        for pkg, files_data in pkg_map.items():
+            digest["pkgs"][pkg] = {
+                "files": len(files_data),
+                "cx_avg": 0,
+                "contents": files_data
             }
-        )
+            if pkg in digest["graph"]:
+                digest["graph"][pkg] = sorted(list(set(digest["graph"][pkg])))
 
-        symbols = {
-            "functions": funcs_data,
-            "structs": structs_data,
-            "interfaces": interfaces_data,
-            "imports": imports_data
+        digest["meta"] = {
+            "files": len(all_files), "pkgs": len(pkg_map),
+            "total_complexity": total_cx, "error_hotspots": total_err_checks,
+            "strategy": "AST" if has_go else "Regex"
         }
 
-        return FileSnapshot(
-            path=str(file_path.relative_to(self.root_path)),
-            language="go",
-            metrics=metrics,
-            symbols=symbols,
-        )
+        return digest
+
+    def _collect_files(self) -> List[Path]:
+        return list(self.root_path.rglob("*.go"))
+
+    def _extract_semantics(self, file_path: Path, has_go: bool) -> Dict[str, Any]:
+        if has_go:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                parser_src_path = os.path.join(temp_dir, "parser.go")
+                with open(parser_src_path, "w", encoding="utf-8") as f:
+                    f.write(GO_AST_PARSER_SRC)
+
+                parser_bin_path = os.path.join(temp_dir, "parser")
+                try:
+                    build_result = subprocess.run(["go", "build", "-o", parser_bin_path, parser_src_path], capture_output=True, text=True, check=True)
+                    cmd = [parser_bin_path, str(file_path)]
+                    output = subprocess.check_output(cmd, stderr=subprocess.PIPE, timeout=15)
+                    return json.loads(output.decode('utf-8'))
+                except (subprocess.CalledProcessError, subprocess.TimeoutExpired, json.JSONDecodeError) as e:
+                    print(f"AST parsing failed for {file_path}: {e}")
+                    if isinstance(e, subprocess.CalledProcessError):
+                        print(f"Stderr: {e.stderr}")
+                        if hasattr(e, 'stdout'):
+                            print(f"Stdout: {e.stdout}")
+
+        # Fallback to regex
+        content = file_path.read_text(encoding="utf-8", errors="ignore")
+        data = {"pk": "unknown", "im": [], "fn": [], "md": {}, "st": [], "if": [],
+                "stat": {"gr": 0, "ch": 0, "er": 0}}
+        m = re.search(r'^\s*package\s+(\w+)', content, re.MULTILINE)
+        if m: data["pk"] = m.group(1)
+        data["im"] = re.findall(r'import\s+"([^"]+)"', content)
+        data["stat"]["er"] = content.count("if err != nil")
+        return data
+
+    def _build_file_snapshot(self, file_path: Path) -> FileSnapshot:
+        # This method is not used in the new dictionary-based build process,
+        # but it's required to satisfy the abstract base class.
+        pass
diff --git a/codesage/semantic_digest/python_snapshot_builder.py b/codesage/semantic_digest/python_snapshot_builder.py
index 3cc54ad..2ac6d9a 100644
--- a/codesage/semantic_digest/python_snapshot_builder.py
+++ b/codesage/semantic_digest/python_snapshot_builder.py
@@ -1,154 +1,278 @@
 from __future__ import annotations
-from datetime import datetime, timezone
+import ast
+import re
+from collections import defaultdict
 from pathlib import Path
-from typing import List
+from typing import List, Dict, Any
 
 from codesage.analyzers.python_parser import PythonParser
-from codesage.config.risk_baseline import RiskBaselineConfig
-from codesage.config.rules_python_baseline import RulesPythonBaselineConfig
-from codesage.risk.python_complexity import analyze_file_complexity
-from codesage.risk.risk_scorer import score_file_risk, summarize_project_risk
-from codesage.rules.engine import RuleEngine
-from codesage.rules.python_ruleset_baseline import get_python_baseline_rules
-from codesage.snapshot.models import (
-    ProjectSnapshot,
-    FileSnapshot,
-    FileMetrics,
-    SnapshotMetadata,
-    DependencyGraph,
-    ProjectRiskSummary,
-)
-
-class SnapshotConfig(dict):
-    pass
-
 from codesage.semantic_digest.base_builder import BaseLanguageSnapshotBuilder, SnapshotConfig
+from codesage.snapshot.models import FileSnapshot
+
+def calculate_complexity(node):
+    complexity = 1
+    nodes_to_walk = node if isinstance(node, (list, tuple)) else [node]
+
+    def count_control_flow(n):
+        nonlocal complexity
+        if isinstance(n, (ast.If, ast.For, ast.While, ast.AsyncFor, ast.With, ast.AsyncWith)):
+            complexity += 1
+        elif isinstance(n, ast.Try):
+            complexity += len(n.handlers)
+        elif isinstance(n, ast.BoolOp) and isinstance(n.op, (ast.And, ast.Or)):
+            complexity += len(n.values) - 1
+        elif isinstance(n, (ast.ListComp, ast.SetComp, ast.DictComp, ast.GeneratorExp)):
+            for generator in n.generators:
+                complexity += len(generator.ifs)
 
+    for root_node in nodes_to_walk:
+        if root_node:
+            for n in ast.walk(root_node):
+                count_control_flow(n)
+    return complexity
 
 class PythonSemanticSnapshotBuilder(BaseLanguageSnapshotBuilder):
     def __init__(self, root_path: Path, config: SnapshotConfig) -> None:
         super().__init__(root_path, config)
         self.parser = PythonParser()
-        self.risk_config = RiskBaselineConfig.from_defaults()
-        self.rules_config = RulesPythonBaselineConfig.default() # This would be loaded from main config in a real app
+        self.args = {
+            'max_docstring_len': 200,
+            'max_args_len': 5,
+            'max_assign_len': 30,
+            'ccn_threshold': 10,
+            'code_sample_lines': 5,
+        }
 
-    def build(self) -> ProjectSnapshot:
+    def build(self) -> Dict[str, Any]:
         files = self._collect_files()
+        digest = {
+            "root": str(self.root_path),
+            "type": "python",
+            "files": [],
+            "modules": defaultdict(lambda: {
+                "f": [], "im": [], "fim": defaultdict(list), "cl": [], "fn": [], "md": {},
+                "dc": set(), "ds": [], "cv": [], "cl_attr": [], "stat": {},
+            }),
+            "deps": defaultdict(set),
+            "sum": {},
+        }
+
+        all_imports = set()
+        total_ccn = 0
+
+        for path in files:
+            rel_path = str(path.relative_to(self.root_path))
+            digest["files"].append(rel_path)
+
+            semantics = self._extract_semantics(path)
+
+            module_name = rel_path.replace(".py", "").replace("/", ".")
+            if module_name.endswith(".__init__"):
+                module_name = module_name[:-9]
+
+            mod_entry = digest["modules"][module_name]
+            mod_entry["f"].append(rel_path)
+            mod_entry["im"].extend(semantics.get("im", []))
+            mod_entry["cv"].extend(semantics.get("cv", []))
+            mod_entry["cl_attr"].extend(semantics.get("cl_attr", []))
 
-        # In a real scenario, this would be populated by a dependency analyzer
-        self.dependency_info = {str(f.relative_to(self.root_path)): [] for f in files}
-
-        file_snapshots = [self._build_file_snapshot(file_path) for file_path in files]
-        dep_graph = self._build_dependency_graph(file_snapshots)
-        project_risk_summary = self._build_project_risk_summary(file_snapshots)
-
-        metadata = SnapshotMetadata(
-            version="1.1", # Version bump for new features
-            timestamp=datetime.now(timezone.utc),
-            project_name=self.root_path.name,
-            file_count=len(file_snapshots),
-            total_size=sum(p.stat().st_size for p in files),
-            tool_version="0.2.0",
-            config_hash="dummy_hash_v2",
-        )
-
-        project = ProjectSnapshot(
-            metadata=metadata,
-            files=file_snapshots,
-            dependencies=dep_graph,
-            risk_summary=project_risk_summary,
-        )
-
-        # Run the rule engine as the final step
-        if self.rules_config.enabled:
-            rules = get_python_baseline_rules(self.rules_config)
-            engine = RuleEngine(rules=rules)
-            project = engine.run(project, self.rules_config)
-
-        return project
+            for imp in semantics.get("im", []):
+                all_imports.add(imp["n"])
+            for module, items in semantics.get("fim", {}).items():
+                mod_entry["fim"][module].extend(items)
+                all_imports.add(module)
+
+            mod_entry["cl"].extend(semantics.get("cl", []))
+            mod_entry["fn"].extend(semantics.get("fn", []))
+            for class_name, methods in semantics.get("md", {}).items():
+                mod_entry["md"].setdefault(class_name, []).extend(methods)
+
+            mod_entry["dc"].update(semantics.get("dc", []))
+            mod_entry["ds"].extend(semantics.get("ds", []))
+            mod_entry["stat"].update(semantics.get("stat", {}))
+
+            total_ccn += sum(f.get("cx", 1) for f in semantics.get("fn", []))
+            total_ccn += sum(m.get("cx", 1) for methods in semantics.get("md", {}).values() for m in methods)
+
+            for imp in semantics.get("im", []):
+                if not imp["n"].startswith(("sys", "os", "re")):
+                    digest["deps"][module_name].add(imp["n"])
+            for module, _ in semantics.get("fim", {}).items():
+                if module and not module.startswith(("sys", "os", "re")):
+                    digest["deps"][module_name].add(module)
+
+        self._finalize_digest(digest, total_ccn, all_imports)
+        return digest
 
     def _collect_files(self) -> List[Path]:
-        return list(self.root_path.rglob("*.py"))
+        return [p for p in self.root_path.rglob("*.py") if not any(part.startswith('.') or part in ('__pycache__', 'venv') for part in p.parts)]
+
+    def _extract_semantics(self, file_path: Path) -> Dict[str, Any]:
+        try:
+            source_code = file_path.read_text(encoding="utf-8")
+            tree = ast.parse(source_code)
+            extractor = PythonSemanticExtractor(self.args)
+            extractor.set_content_lines(source_code.splitlines())
+            extractor.visit(tree)
+            info = extractor.info
+            info["stat"]["io"] = dict(info["stat"]["io"])
+            return info
+        except (SyntaxError, UnicodeDecodeError):
+            return {}
+
+    def _finalize_digest(self, digest: Dict[str, Any], total_ccn: int, all_imports: set) -> None:
+        total_functions = 0
+        has_async = False
+        for module_name, data in digest["modules"].items():
+            data["dc"] = sorted(list(data["dc"]))
+            if data.get("stat", {}).get("async", 0) > 0:
+                has_async = True
+            all_cx = [f.get("cx", 1) for f in data["fn"]]
+            all_cx.extend(m.get("cx", 1) for methods in data["md"].values() for m in methods)
+            count_functions = len(data["fn"]) + sum(len(m) for m in data["md"].values())
+            total_functions += count_functions
+            avg_ccn = round(sum(all_cx) / max(count_functions, 1), 1)
+            high_ccn_count = sum(1 for cx in all_cx if cx >= self.args['ccn_threshold'])
+
+            summary_parts = []
+            if data["cl"]: summary_parts.append(f"CLS:{len(data['cl'])}")
+            if count_functions > 0:
+                summary_parts.append(f"FN:{count_functions}")
+                summary_parts.append(f"AVG_CX:{avg_ccn}")
+                if high_ccn_count > 0: summary_parts.append(f"HIGH_CX:{high_ccn_count}")
+            data["sm"] = "；".join(summary_parts) if summary_parts else "Python Module"
+            data["fim"] = dict(data["fim"])
+
+        digest["modules"] = dict(digest["modules"])
+        digest["deps"] = {mod: sorted(list(deps)) for mod, deps in digest["deps"].items()}
+
+        std_libs = {"os", "sys", "re", "json", "time", "logging"}
+        project_pkgs = set(digest["modules"].keys())
+        tech_stack = sorted(list((all_imports - std_libs) - project_pkgs))
+
+        digest["sum"] = {
+            "mod_count": len(digest["modules"]),
+            "cl_count": sum(len(m["cl"]) for m in digest["modules"].values()),
+            "fn_count": total_functions,
+            "file_count": len(digest["files"]),
+            "total_ccn": total_ccn,
+            "tech_stack": tech_stack[:10],
+            "config_files": [],
+            "has_async": has_async,
+            "uses_type_hints": False,
+        }
+        digest["root"] = str(self.root_path.resolve())
+
 
     def _build_file_snapshot(self, file_path: Path) -> FileSnapshot:
-        source_code = file_path.read_text()
-        self.parser.parse(source_code)
-
-        functions = self.parser.extract_functions()
-        classes = self.parser.extract_classes()
-        variables = self.parser.extract_variables()
-        imports = self.parser.extract_imports()
-
-        complexity_results = analyze_file_complexity(source_code, self.risk_config.threshold_complexity_high)
-
-        # Create a map of function name to complexity
-        if complexity_results:
-            complexity_map = {
-                f.name: f.complexity for f in complexity_results.functions
-            }
-            for func in functions:
-                func.cyclomatic_complexity = complexity_map.get(func.name, 1)
-                # Sample code if complexity is high
-                if func.cyclomatic_complexity >= self.risk_config.threshold_complexity_high:
-                    # Very basic sampling: first 5 lines of the function
-                    # Ideally we would use the parser's start_line/end_line to slice the source
-                    # But we don't have easy access to line-based source here without splitting
-                    lines = source_code.splitlines()
-                    start = func.start_line
-                    end = min(func.end_line, start + 5)
-                    func.value = "\n".join(lines[start:end]) # Store sample in 'value' generic field
-
-        fan_in, fan_out = self._calculate_fan_in_out(str(file_path.relative_to(self.root_path)))
-
-        metrics = FileMetrics(
-            lines_of_code=complexity_results.loc if complexity_results else 0,
-            num_functions=len(functions),
-            num_types=len(classes),
-            language_specific={
-                "python": {
-                    "num_classes": len(classes),
-                    "num_methods": sum(len(c.methods) for c in classes),
-                    "has_async": any(f.is_async for f in functions) or any(m.is_async for c in classes for m in c.methods),
-                    "uses_type_hints": False,  # Placeholder
-                    "max_cyclomatic_complexity": complexity_results.max_cyclomatic_complexity if complexity_results else 0,
-                    "avg_cyclomatic_complexity": complexity_results.avg_cyclomatic_complexity if complexity_results else 0.0,
-                    "high_complexity_functions": complexity_results.high_complexity_functions if complexity_results else 0,
-                    "fan_in": fan_in,
-                    "fan_out": fan_out,
-                }
-            }
-        )
-
-        file_risk = score_file_risk(metrics, self.risk_config)
-
-        symbols = {
-            "classes": [c.model_dump() for c in classes],
-            "functions": [f.model_dump() for f in functions],
-            "variables": [v.model_dump() for v in variables],
-            "imports": [i.model_dump() for i in imports],
-            "functions_detail": [f.model_dump() for f in functions], # For richer rule context
+        # This method is not used in the new dictionary-based build process,
+        # but it's required to satisfy the abstract base class.
+        pass
+
+class PythonSemanticExtractor(ast.NodeVisitor):
+    def __init__(self, args):
+        self.args = args
+        self.info = {
+            "im": [], "fim": defaultdict(list), "cl": [], "fn": [], "md": {}, "ds": [], "cv": [],
+            "cl_attr": [], "dc": [], "th": [],
+            "stat": {"async": 0, "th": 0, "io": defaultdict(int), "err": {"total": 0, "generic": 0}},
+        }
+        self.current_class = None
+        self.content_lines = []
+
+    def set_content_lines(self, lines):
+        self.content_lines = lines
+
+    def visit_Import(self, node):
+        for alias in node.names:
+            self.info["im"].append({"n": alias.name, "ln": node.lineno})
+        self.generic_visit(node)
+
+    def visit_ImportFrom(self, node):
+        module = node.module or ""
+        items = [{"n": alias.name, "ln": node.lineno} for alias in node.names]
+        self.info["fim"].setdefault(module, []).extend(items)
+        self.generic_visit(node)
+
+    def visit_ClassDef(self, node):
+        class_info = {
+            "n": node.name, "ln": node.lineno,
+            "bs": [self._get_name(base) for base in node.bases],
+            "dc": [self._get_name(dec) for dec in node.decorator_list], "attrs": []
         }
+        for item in node.body:
+            if isinstance(item, ast.Assign):
+                value_repr = self._get_annotation(item.value, max_len=self.args['max_assign_len'])
+                for target in item.targets:
+                    if isinstance(target, ast.Name):
+                        class_info["attrs"].append({"n": target.id, "ln": target.lineno, "val": value_repr})
+            elif isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                break
+        self.info["cl"].append(class_info)
+        old_class = self.current_class
+        self.current_class = node.name
+        self.generic_visit(node)
+        self.current_class = old_class
+
+    def _process_function_or_method(self, node):
+        ccn = calculate_complexity(node.body)
+        args_list = [arg.arg for arg in node.args.args]
+        if len(args_list) > self.args['max_args_len']:
+            args_list = args_list[:self.args['max_args_len']] + ["..."]
+
+        func_info = {
+            "n": node.name, "ln": node.lineno, "cx": ccn, "args": args_list,
+            "dc": [self._get_name(dec) for dec in node.decorator_list],
+            "ret": self._get_annotation(node.returns),
+            "async": isinstance(node, ast.AsyncFunctionDef),
+        }
+
+        if ccn >= self.args['ccn_threshold'] and self.content_lines:
+            start_line = node.lineno
+            end_line = getattr(node, 'end_lineno', start_line + 5)
+            sample_end = min(end_line, start_line + self.args['code_sample_lines'])
+            sample_lines = self.content_lines[start_line - 1 : sample_end]
+            if sample_lines:
+                indent = len(sample_lines[0]) - len(sample_lines[0].lstrip())
+                func_info["sample"] = [line[indent:].rstrip() for line in sample_lines]
+
+        if self.current_class:
+            self.info["md"].setdefault(self.current_class, []).append(func_info)
+        else:
+            self.info["fn"].append(func_info)
+
+    def visit_FunctionDef(self, node):
+        self._process_function_or_method(node)
+        self.generic_visit(node)
+
+    def visit_AsyncFunctionDef(self, node):
+        self._process_function_or_method(node)
+        self.generic_visit(node)
+
+    def visit_Assign(self, node):
+        if self.current_class is None:
+            value_repr = self._get_annotation(node.value, max_len=self.args['max_assign_len'])
+            for target in node.targets:
+                if isinstance(target, ast.Name):
+                    name = target.id
+                    is_constant = name.isupper() and (name.replace('_', '').isalnum())
+                    self.info["cv"].append({
+                        "n": name, "ln": target.lineno, "const": is_constant, "val": value_repr,
+                    })
+        self.generic_visit(node)
+
+    def _get_name(self, node):
+        if isinstance(node, ast.Name): return node.id
+        if isinstance(node, ast.Attribute):
+            base = self._get_name(node.value)
+            return f"{base}.{node.attr}" if base else node.attr
+        return ""
 
-        return FileSnapshot(
-            path=str(file_path.relative_to(self.root_path)),
-            language="python",
-            metrics=metrics,
-            symbols=symbols,
-            risk=file_risk,
-        )
-
-    def _build_dependency_graph(self, file_snapshots: List[FileSnapshot]) -> DependencyGraph:
-        # Placeholder implementation
-        return DependencyGraph(internal=[], external=[])
-
-    def _calculate_fan_in_out(self, file_path: str) -> (int, int):
-        fan_out = len(self.dependency_info.get(file_path, []))
-        fan_in = 0
-        for _, dependencies in self.dependency_info.items():
-            if file_path in dependencies:
-                fan_in += 1
-        return fan_in, fan_out
-
-    def _build_project_risk_summary(self, file_snapshots: List[FileSnapshot]) -> ProjectRiskSummary:
-        file_risks = {fs.path: fs.risk for fs in file_snapshots if fs.risk}
-        return summarize_project_risk(file_risks)
+    def _get_annotation(self, node, max_len=50):
+        if node is None: return None
+        if hasattr(ast, "unparse"):
+            try:
+                rep = ast.unparse(node).strip()
+                return rep[:max_len] + "..." if len(rep) > max_len else rep
+            except Exception: pass
+        return str(type(node).__name__)
diff --git a/codesage/snapshot/versioning.py b/codesage/snapshot/versioning.py
index 2420c69..bc514a2 100644
--- a/codesage/snapshot/versioning.py
+++ b/codesage/snapshot/versioning.py
@@ -83,47 +83,59 @@ def _update_index(self, snapshot_path: str, metadata: SnapshotMetadata):
         self._save_index(index)
 
     def _get_expired_snapshots(self, index: List[Dict[str, Any]], now: datetime) -> List[Dict[str, Any]]:
-        """Identifies expired snapshots."""
-        valid_snapshots = []
-        for s in index:
-            try:
-                ts = datetime.fromisoformat(s["timestamp"])
-                if ts.tzinfo is None:
-                    ts = ts.replace(tzinfo=timezone.utc)
-                if now - ts <= timedelta(days=self.retention_days):
-                    valid_snapshots.append(s)
-            except ValueError:
-                # Skip malformed timestamps
-                continue
-
-        if len(valid_snapshots) > self.max_versions:
-            valid_snapshots = sorted(
-                valid_snapshots, key=lambda s: s["timestamp"], reverse=True
-            )[:self.max_versions]
-
-        valid_versions = {s["version"] for s in valid_snapshots}
-        return [s for s in index if s["version"] not in valid_versions]
-
-    def cleanup_expired_snapshots(self):
-        """Removes expired snapshots based on retention days and max versions."""
+        """Identifies expired snapshots based on retention policies."""
+
+        def parse_timestamp(ts_str):
+            ts = datetime.fromisoformat(ts_str)
+            if ts.tzinfo is None:
+                return ts.replace(tzinfo=timezone.utc)
+            return ts
+
+        try:
+            sorted_snapshots = sorted(
+                index,
+                key=lambda s: parse_timestamp(s["timestamp"]),
+                reverse=True
+            )
+        except (ValueError, TypeError):
+            return []
+
+        kept_snapshots = sorted_snapshots[:self.max_versions]
+
+        kept_by_date = {
+            s['version'] for s in kept_snapshots
+            if (now - parse_timestamp(s["timestamp"])) <= timedelta(days=self.retention_days)
+        }
+
+        return [s for s in index if s["version"] not in kept_by_date]
+
+    def cleanup_expired_snapshots(self) -> int:
+        """Removes expired snapshots and returns the count of deleted files."""
         index = self._load_index()
         if not index:
-            return
+            return 0
 
         now = datetime.now(timezone.utc)
         expired_snapshots = self._get_expired_snapshots(index, now)
 
         if not expired_snapshots:
-            return
+            return 0
 
         expired_versions = {s["version"] for s in expired_snapshots}
         valid_snapshots = [s for s in index if s["version"] not in expired_versions]
 
+        deleted_count = 0
         for snapshot_data in expired_snapshots:
-            if os.path.exists(snapshot_data["path"]):
-                os.remove(snapshot_data["path"])
+            snapshot_path = snapshot_data.get("path")
+            if snapshot_path and os.path.exists(snapshot_path):
+                try:
+                    os.remove(snapshot_path)
+                    deleted_count += 1
+                except OSError:
+                    pass
 
         self._save_index(valid_snapshots)
+        return deleted_count
 
     def list_snapshots(self) -> List[Dict[str, Any]]:
         """Lists all managed snapshots from the index."""
diff --git a/codesage/snapshot/yaml_generator.py b/codesage/snapshot/yaml_generator.py
index 0ccc7b5..a6aee9f 100644
--- a/codesage/snapshot/yaml_generator.py
+++ b/codesage/snapshot/yaml_generator.py
@@ -13,11 +13,14 @@ def generate(self, analysis_results: List[Dict[str, Any]]) -> ProjectSnapshot:
             return analysis_results[0]
         raise NotImplementedError("Direct generation from analysis_results is not supported in this workflow.")
 
-    def export(self, snapshot: ProjectSnapshot, output_path: Path) -> None:
-        """Exports the ProjectSnapshot to a YAML file."""
-        # Use Pydantic's serialization which will include all fields by default,
-        # including the new `issues` and `issues_summary` fields.
-        data = snapshot.model_dump(mode="json", exclude_none=True)
+    def export(self, snapshot: Any, output_path: Path) -> None:
+        """Exports the ProjectSnapshot or a dictionary to a YAML file."""
+        if isinstance(snapshot, ProjectSnapshot):
+            data = snapshot.model_dump(mode="json", exclude_none=True)
+        elif isinstance(snapshot, dict):
+            data = snapshot
+        else:
+            raise TypeError("Unsupported snapshot type for YAML export")
 
         with open(output_path, "w", encoding="utf-8") as f:
             yaml.safe_dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
diff --git a/examples/go_test/main.go b/examples/go_test/main.go
new file mode 100644
index 0000000..43732cd
--- /dev/null
+++ b/examples/go_test/main.go
@@ -0,0 +1,24 @@
+package main
+
+import "fmt"
+
+type Greeter interface {
+    Greet()
+}
+
+type Person struct {
+    Name string
+}
+
+func (p Person) Greet() {
+    fmt.Printf("Hello, %s!\n", p.Name)
+}
+
+func NewPerson(name string) Person {
+    return Person{Name: name}
+}
+
+func main() {
+    p := NewPerson("world")
+    p.Greet()
+}
diff --git a/examples/python_test/main.py b/examples/python_test/main.py
new file mode 100644
index 0000000..c088b85
--- /dev/null
+++ b/examples/python_test/main.py
@@ -0,0 +1,13 @@
+class Greeter:
+    def __init__(self, name):
+        self.name = name
+
+    def greet(self):
+        print(f"Hello, {self.name}!")
+
+def main():
+    greeter = Greeter("world")
+    greeter.greet()
+
+if __name__ == "__main__":
+    main()
diff --git a/go_test_codesage.yaml b/go_test_codesage.yaml
new file mode 100644
index 0000000..595b2e3
--- /dev/null
+++ b/go_test_codesage.yaml
@@ -0,0 +1,8 @@
+root: go_test
+pkgs:
+  main:
+    files:
+    - main.go
+meta:
+  files: 1
+  pkgs: 1
diff --git a/go_test_script.yaml b/go_test_script.yaml
new file mode 100644
index 0000000..595b2e3
--- /dev/null
+++ b/go_test_script.yaml
@@ -0,0 +1,8 @@
+root: go_test
+pkgs:
+  main:
+    files:
+    - main.go
+meta:
+  files: 1
+  pkgs: 1
diff --git a/python_test_codesage.yaml b/python_test_codesage.yaml
new file mode 100644
index 0000000..52ccf96
--- /dev/null
+++ b/python_test_codesage.yaml
@@ -0,0 +1,70 @@
+root: /app/examples/python_test
+type: python
+files:
+- main.py
+modules:
+  main:
+    f:
+    - main.py
+    im: []
+    fim: {}
+    cl:
+    - n: Greeter
+      ln: 1
+      bs: []
+      dc: []
+      attrs: []
+    fn:
+    - n: main
+      ln: 8
+      cx: 1
+      args: []
+      dc: []
+      ret: null
+      async: false
+    md:
+      Greeter:
+      - n: __init__
+        ln: 2
+        cx: 1
+        args:
+        - self
+        - name
+        dc: []
+        ret: null
+        async: false
+      - n: greet
+        ln: 5
+        cx: 1
+        args:
+        - self
+        dc: []
+        ret: null
+        async: false
+    dc: []
+    ds: []
+    cv:
+    - n: greeter
+      ln: 9
+      const: false
+      val: Greeter('world')
+    cl_attr: []
+    stat:
+      async: 0
+      th: 0
+      io: {}
+      err:
+        total: 0
+        generic: 0
+    sm: CLS:1；FN:3；AVG_CX:1.0
+deps: {}
+sum:
+  mod_count: 1
+  cl_count: 1
+  fn_count: 3
+  file_count: 1
+  total_ccn: 3
+  tech_stack: []
+  config_files: []
+  has_async: false
+  uses_type_hints: false
diff --git a/python_test_script.yaml b/python_test_script.yaml
new file mode 100644
index 0000000..1ec837c
--- /dev/null
+++ b/python_test_script.yaml
@@ -0,0 +1,71 @@
+root: /app/examples/python_test
+type: python
+files:
+- main.py
+modules:
+  main:
+    f:
+    - main.py
+    im: []
+    fim: !!python/object/apply:collections.defaultdict
+    - !!python/name:builtins.list ''
+    cl:
+    - n: Greeter
+      ln: 1
+      bs: []
+      dc: []
+      attrs: []
+    fn:
+    - n: main
+      ln: 8
+      cx: 1
+      args: []
+      dc: []
+      ret: null
+      async: false
+    md:
+      Greeter:
+      - n: __init__
+        ln: 2
+        cx: 1
+        args:
+        - self
+        - name
+        dc: []
+        ret: null
+        async: false
+      - n: greet
+        ln: 5
+        cx: 1
+        args:
+        - self
+        dc: []
+        ret: null
+        async: false
+    dc: []
+    ds: []
+    cv:
+    - n: greeter
+      ln: 9
+      const: false
+      val: Greeter('world')
+    cl_attr: []
+    stat:
+      async: 0
+      th: 0
+      io: {}
+      err:
+        total: 0
+        generic: 0
+    sm: CLS:1；FN:3；AVG_CX:1.0
+deps: {}
+sum:
+  mod_count: 1
+  cl_count: 1
+  fn_count: 3
+  file_count: 1
+  total_ccn: 3
+  config_files: []
+  tech_stack: []
+  has_async: false
+  uses_type_hints: false
diff --git a/semantic-snapshot/py-semantic-snapshot-v3.py b/semantic-snapshot/py-semantic-snapshot-v3.py
index 041e744..b804861 100644
--- a/semantic-snapshot/py-semantic-snapshot-v3.py
+++ b/semantic-snapshot/py-semantic-snapshot-v3.py
@@ -700,17 +700,29 @@ def generate_semantic_digest(repo_path, output_path, args):
         "uses_type_hints": False, 
     }
 
+    # Convert defaultdict to dict for clean YAML output
+    def convert_defaultdict_to_dict(d):
+        if isinstance(d, defaultdict):
+            d = {k: convert_defaultdict_to_dict(v) for k, v in d.items()}
+        elif isinstance(d, dict):
+            return {k: convert_defaultdict_to_dict(v) for k, v in d.items()}
+        elif isinstance(d, list):
+            return [convert_defaultdict_to_dict(i) for i in d]
+        return d
+
+    final_digest = convert_defaultdict_to_dict(digest)
+
     output_path = ensure_unicode(output_path)
     try:
         with open(output_path, "w", encoding="utf-8") as f:
-            yaml_content = yaml.dump(
-                digest,
+            yaml.dump(
+                final_digest,
+                f,
                 allow_unicode=True,
                 default_flow_style=False,
                 sort_keys=False,
                 width=120,
             )
-            f.write(yaml_content)
 
         print("✅ Semantic project digest generated: {}".format(output_path))
         print(
diff --git a/tests/cli/test_snapshot.py b/tests/cli/test_snapshot.py
new file mode 100644
index 0000000..7f505f0
--- /dev/null
+++ b/tests/cli/test_snapshot.py
@@ -0,0 +1,241 @@
+
+import os
+import shutil
+import subprocess
+import sys
+import yaml
+import json
+from pathlib import Path
+
+import pytest
+from click.testing import CliRunner
+
+from codesage.cli.main import main
+from codesage.utils.file_utils import read_yaml_file as load_yaml
+
+# Mark all tests in this file as 'e2e'
+pytestmark = pytest.mark.e2e
+
+@pytest.fixture(scope="module")
+def runner():
+    return CliRunner()
+
+@pytest.fixture(scope="module")
+def setup_projects(tmpdir_factory):
+    """Creates temporary Python and Go projects for testing."""
+    base_dir = Path(str(tmpdir_factory.mktemp("projects")))
+
+    # Python Project
+    py_project_dir = base_dir / "py_project"
+    py_project_dir.mkdir()
+    (py_project_dir / "main.py").write_text(
+        "class MyClass:\n"
+        "    def method(self, arg1):\n"
+        "        return arg1 * 2\n\n"
+        "def top_level_func(a, b):\n"
+        "    return a + b\n"
+    )
+
+    # Go Project
+    go_project_dir = base_dir / "go_project"
+    go_project_dir.mkdir()
+    (go_project_dir / "main.go").write_text(
+        "package main\n\n"
+        "import \"fmt\"\n\n"
+        "type Greeter struct {\n"
+        "    Name string\n"
+        "}\n\n"
+        "func (g *Greeter) Greet() {\n"
+        "    fmt.Println(\"Hello,\", g.Name)\n"
+        "}\n\n"
+        "func main() {\n"
+        "    g := Greeter{Name: \"World\"}\n"
+        "    g.Greet()\n"
+        "}\n"
+    )
+
+    return base_dir, py_project_dir, go_project_dir
+
+
+def run_standalone_script(script_path: Path, project_dir: Path) -> dict:
+    """
+    Helper to run the original snapshot scripts. These scripts write their
+    output to a file instead of stdout.
+    """
+    script_abs = script_path.resolve()
+    env = os.environ.copy()
+    env["PYTHONPATH"] = str(Path.cwd())
+
+    # The scripts generate predictable output filenames in the CWD
+    if "py-semantic" in script_abs.name:
+        output_filename = "python_semantic_digest.yaml"
+        command = [sys.executable, str(script_abs), str(project_dir)]
+    elif "go-semantic" in script_abs.name:
+        output_filename = "go_digest.yaml"
+        command = [sys.executable, str(script_abs), str(project_dir)]
+    else:
+        raise ValueError(f"Unknown script type: {script_abs.name}")
+
+    output_filepath = Path.cwd() / output_filename
+    if output_filepath.exists():
+        output_filepath.unlink()
+
+    subprocess.run(
+        command,
+        capture_output=True, # Still capture to see logs if needed
+        text=True,
+        check=True,
+        cwd=Path.cwd(),
+    )
+
+    if not output_filepath.exists():
+        raise FileNotFoundError(f"Script {script_abs.name} did not generate {output_filename}")
+
+    with open(output_filepath, "r", encoding="utf-8") as f:
+        # Use FullLoader to handle Python-specific tags like defaultdict
+        data = yaml.load(f, Loader=yaml.FullLoader)
+
+    # Clean up the generated file
+    output_filepath.unlink()
+
+    return data
+
+
+def test_python_snapshot_consistency(runner: CliRunner, setup_projects):
+    """Verify codesage 'py' format matches the original Python script."""
+    base_dir, py_project_dir, _ = setup_projects
+    script_path = Path("semantic-snapshot/py-semantic-snapshot-v3.py")
+
+    # 1. Run standalone script
+    expected_output = run_standalone_script(script_path, py_project_dir)
+
+    # 2. Run codesage
+    output_file = base_dir / "codesage_py_output.yml"
+    result = runner.invoke(
+        main,
+        [
+            "snapshot", "create",
+            str(py_project_dir),
+            "--format", "python-semantic-digest",
+            "--language", "python",
+            "--output", str(output_file)
+        ],
+        catch_exceptions=False
+    )
+    assert result.exit_code == 0
+    assert output_file.exists()
+    codesage_output = load_yaml(output_file)
+
+    # 3. Compare outputs
+    assert codesage_output["root"] == expected_output["root"]
+    assert "main" in codesage_output["modules"]
+    assert len(codesage_output["modules"]["main"]["cl"]) > 0
+    assert codesage_output["modules"]["main"]["cl"][0]["n"] == "MyClass"
+
+
+def test_go_snapshot_consistency(runner: CliRunner, setup_projects):
+    """Verify codesage 'go' format matches the original Go script."""
+    base_dir, _, go_project_dir = setup_projects
+    script_path = Path("semantic-snapshot/go-semantic-snapshot-v4.py")
+
+    # 1. Run standalone script
+    expected_output = run_standalone_script(script_path, go_project_dir)
+
+    # 2. Run codesage
+    output_file = base_dir / "codesage_go_output.yml"
+    result = runner.invoke(
+        main,
+        [
+            "snapshot", "create",
+            str(go_project_dir),
+            "--format", "go-semantic-digest",
+            "--language", "go",
+            "--output", str(output_file)
+        ],
+        catch_exceptions=False
+    )
+    assert result.exit_code == 0
+    assert output_file.exists()
+    codesage_output = load_yaml(output_file)
+
+    # 3. Compare outputs
+    # Normalize for comparison - e.g., rounding floats if necessary
+    if "pkgs" in codesage_output and "main" in codesage_output["pkgs"]:
+        codesage_output["pkgs"]["main"]["cx_avg"] = round(codesage_output["pkgs"]["main"]["cx_avg"])
+    if "pkgs" in expected_output and "main" in expected_output["pkgs"]:
+         expected_output["pkgs"]["main"]["cx_avg"] = round(expected_output["pkgs"]["main"]["cx_avg"])
+
+    # Check key structures
+    assert codesage_output["root"] == expected_output["root"]
+    assert "main" in codesage_output["pkgs"]
+    assert "main" in expected_output["pkgs"]
+    assert codesage_output["pkgs"]["main"]["files"] == expected_output["pkgs"]["main"]["files"]
+
+    # Check for specific semantic details
+    main_pkg_contents = codesage_output["pkgs"]["main"]["contents"][0]
+    assert "st" in main_pkg_contents, "Structs ('st') not found in Go snapshot"
+    assert len(main_pkg_contents["st"]) > 0, "No structs found in Go snapshot"
+    assert main_pkg_contents["st"][0]["n"] == "Greeter", "Greeter struct not found"
+
+    assert "fn" in main_pkg_contents, "Functions ('fn') not found in Go snapshot"
+    assert len(main_pkg_contents["fn"]) > 0, "No functions found in Go snapshot"
+    assert main_pkg_contents["fn"][0]["n"] == "main", "main function not found"
+
+    assert "md" in main_pkg_contents, "Methods ('md') not found in Go snapshot"
+    assert "Greeter" in main_pkg_contents["md"], "Methods for Greeter struct not found"
+    assert len(main_pkg_contents["md"]["Greeter"]) > 0, "No methods found for Greeter struct"
+    assert main_pkg_contents["md"]["Greeter"][0]["n"] == "Greet", "Greet method not found"
+
+
+def test_default_snapshot_creation(runner: CliRunner, setup_projects, tmp_path):
+    """Test default snapshot creation creates a .json file."""
+    _, py_project_dir, _ = setup_projects
+
+    # Change to a temporary directory to isolate .codesage folder
+    os.chdir(tmp_path)
+
+    result = runner.invoke(
+        main, ["snapshot", "create", str(py_project_dir)], catch_exceptions=False
+    )
+    assert result.exit_code == 0
+
+    snapshot_dir = tmp_path / ".codesage" / "snapshots" / py_project_dir.name
+    assert snapshot_dir.exists()
+
+    # Check for the presence of at least one JSON snapshot file
+    json_files = list(snapshot_dir.glob("*.json"))
+    assert len(json_files) > 0, "No JSON snapshot file was created"
+
+
+def test_snapshot_show_and_cleanup(runner: CliRunner, setup_projects, tmp_path):
+    """Test 'snapshot show' and 'snapshot cleanup' commands."""
+    _, py_project_dir, _ = setup_projects
+    project_name = py_project_dir.name
+
+    os.chdir(tmp_path)
+
+    # 1. Create a few snapshots
+    for _ in range(3):
+        res = runner.invoke(main, ["snapshot", "create", str(py_project_dir), "--project", project_name], catch_exceptions=False)
+        assert res.exit_code == 0
+
+    snapshot_dir = tmp_path / ".codesage" / "snapshots" / project_name
+    # Filter out symlinks and the index file
+    snapshots = [p for p in snapshot_dir.glob("v*.json")]
+    assert len(snapshots) == 3
+
+    # 2. Test 'snapshot show'
+    result = runner.invoke(main, ["snapshot", "show", "--project", project_name, snapshots[0].stem], catch_exceptions=False)
+    assert result.exit_code == 0
+    assert project_name in result.output
+    assert snapshots[0].stem in result.output
+
+    # 3. Test 'snapshot cleanup'
+    result = runner.invoke(main, ["snapshot", "cleanup", "--project", project_name, "--keep", "1"], catch_exceptions=False)
+    assert result.exit_code == 0
+
+    remaining_snapshots = [p for p in snapshot_dir.glob("v*.json")]
+    assert len(remaining_snapshots) == 1
+
+    # Restore original working directory if needed by other tests
+    os.chdir(Path.cwd())
diff --git a/tests/cli/test_snapshot_command.py b/tests/cli/test_snapshot_command.py
deleted file mode 100644
index 32e538f..0000000
--- a/tests/cli/test_snapshot_command.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from click.testing import CliRunner
-from codesage.cli.main import main
-from unittest.mock import patch, MagicMock
-import os
-
-@patch('codesage.cli.commands.snapshot.SnapshotVersionManager')
-def test_snapshot_create(mock_manager):
-    """Test snapshot creation."""
-    runner = CliRunner()
-    instance = mock_manager.return_value
-    instance.save_snapshot.return_value = ".codesage/snapshots/v1.json"
-
-    with runner.isolated_filesystem():
-        os.makedirs("test_project")
-        result = runner.invoke(main, ['snapshot', 'create', 'test_project'])
-
-        assert result.exit_code == 0
-        assert "Snapshot created at .codesage/snapshots/v1.json" in result.output
-
-@patch('codesage.cli.commands.snapshot.SnapshotVersionManager')
-def test_snapshot_list(mock_manager):
-    """Test listing snapshots."""
-    runner = CliRunner()
-    instance = mock_manager.return_value
-    instance.list_snapshots.return_value = [
-        {'version': 'v1', 'timestamp': '2023-01-01T12:00:00'},
-        {'version': 'v2', 'timestamp': '2023-01-02T12:00:00'},
-    ]
-
-    result = runner.invoke(main, ['snapshot', 'list'])
-
-    assert result.exit_code == 0
-    assert "v1" in result.output
-    assert "v2" in result.output