Skip to content
136 changes: 126 additions & 10 deletions pkg/sources/gitlab/gitlab.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,49 @@ type Source struct {
printLegacyJSON bool

projectsPerPage int

// cache of repo URL to project info, used when generating metadata for chunks
repoToProjCache repoToProjectCache
}

// project holds the GitLab project details that are attached to chunk metadata.
type project struct {
	id    int    // numeric GitLab project ID
	name  string // project name as stored by the caller
	owner string // project owner identifier as stored by the caller; may be empty
}

// repoToProjectCache is a mutex-guarded map from repository URL to project
// details. The zero value is ready to use: reads, deletes and clears on an
// unallocated map are no-ops, and set allocates the map on first write.
type repoToProjectCache struct {
	sync.RWMutex

	cache map[string]*project
}

// get returns the project cached for repo and whether an entry was present.
func (r *repoToProjectCache) get(repo string) (*project, bool) {
	r.RLock()
	defer r.RUnlock()

	proj, ok := r.cache[repo]
	return proj, ok
}

// set stores proj under repo, replacing any existing entry.
func (r *repoToProjectCache) set(repo string, proj *project) {
	r.Lock()
	defer r.Unlock()

	// Writing to a nil map panics, so allocate lazily to keep the zero value
	// of the cache usable.
	if r.cache == nil {
		r.cache = make(map[string]*project)
	}
	r.cache[repo] = proj
}

// del removes the entry for repo, if any.
func (r *repoToProjectCache) del(repo string) {
	r.Lock()
	defer r.Unlock()

	delete(r.cache, repo)
}

// clear removes every entry from the cache.
func (r *repoToProjectCache) clear() {
	r.Lock()
	defer r.Unlock()

	clear(r.cache)
}

// WithCustomContentWriter sets the useCustomContentWriter flag on the source.
Expand Down Expand Up @@ -227,18 +270,26 @@ func (s *Source) Init(ctx context.Context, name string, jobId sources.JobID, sou
SkipArchives: conn.GetSkipArchives(),
Concurrency: concurrency,
SourceMetadataFunc: func(file, email, commit, timestamp, repository, repositoryLocalPath string, line int64) *source_metadatapb.MetaData {
gitlabMetadata := &source_metadatapb.Gitlab{
Commit: sanitizer.UTF8(commit),
File: sanitizer.UTF8(file),
Email: sanitizer.UTF8(email),
Repository: sanitizer.UTF8(repository),
RepositoryLocalPath: sanitizer.UTF8(repositoryLocalPath),
Link: giturl.GenerateLink(repository, commit, file, line),
Timestamp: sanitizer.UTF8(timestamp),
Line: line,
}
proj, ok := s.repoToProjCache.get(repository)
if ok {
gitlabMetadata.ProjectId = int64(proj.id)
gitlabMetadata.ProjectName = proj.name
gitlabMetadata.ProjectOwner = proj.owner
}

return &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_Gitlab{
Gitlab: &source_metadatapb.Gitlab{
Commit: sanitizer.UTF8(commit),
File: sanitizer.UTF8(file),
Email: sanitizer.UTF8(email),
Repository: sanitizer.UTF8(repository),
RepositoryLocalPath: sanitizer.UTF8(repositoryLocalPath),
Link: giturl.GenerateLink(repository, commit, file, line),
Timestamp: sanitizer.UTF8(timestamp),
Line: line,
},
Gitlab: gitlabMetadata,
},
}
},
Expand All @@ -247,6 +298,10 @@ func (s *Source) Init(ctx context.Context, name string, jobId sources.JobID, sou
}
s.git = git.NewGit(cfg)

s.repoToProjCache = repoToProjectCache{
cache: make(map[string]*project),
}

return nil
}

Expand Down Expand Up @@ -291,12 +346,22 @@ func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, tar
},
}

// Clear the repo to project cache when done.
defer s.repoToProjCache.clear()

if err := s.listProjects(ctx, apiClient, ignoreRepo, reporter); err != nil {
return err
}

} else {
gitlabReposEnumerated.WithLabelValues(s.name).Set(float64(len(repos)))
// ensure project details for specified repos are cached
// this is required to populate metadata during chunking
for _, repo := range repos {
s.ensureProjectInCache(ctx, repo)
// remove project from cache to free up memory after chunking is done
defer s.repoToProjCache.del(repo)
}
}

s.repos = repos
Expand Down Expand Up @@ -548,6 +613,7 @@ func (s *Source) getAllProjectRepos(
}
// Report the unit.
ctx.Logger().V(3).Info("accepting project")
s.repoToProjCache.set(proj.HTTPURLToRepo, gitlabProjectToCacheProject(proj))
unit := git.SourceUnit{Kind: git.UnitRepo, ID: proj.HTTPURLToRepo}
gitlabReposEnumerated.WithLabelValues(s.name).Inc()
projectsWithNamespace = append(projectsWithNamespace, proj.NameWithNamespace)
Expand Down Expand Up @@ -740,6 +806,7 @@ func (s *Source) getAllProjectReposV2(
// report the unit.
projCtx.Logger().V(3).Info("accepting project")

s.repoToProjCache.set(project.HTTPURLToRepo, gitlabProjectToCacheProject(project))
unit := git.SourceUnit{Kind: git.UnitRepo, ID: project.HTTPURLToRepo}
gitlabReposEnumerated.WithLabelValues(s.name).Inc()

Expand Down Expand Up @@ -842,6 +909,7 @@ func (s *Source) getAllProjectReposInGroups(
// report the unit.
projCtx.Logger().V(3).Info("accepting project")

s.repoToProjCache.set(proj.HTTPURLToRepo, gitlabProjectToCacheProject(proj))
unit := git.SourceUnit{Kind: git.UnitRepo, ID: proj.HTTPURLToRepo}
gitlabReposEnumerated.WithLabelValues(s.name).Inc()
projectsWithNamespace = append(projectsWithNamespace, proj.NameWithNamespace)
Expand All @@ -864,6 +932,20 @@ func (s *Source) getAllProjectReposInGroups(
return nil
}

// gitlabProjectToCacheProject converts a GitLab API project into the compact
// form kept in the repo-to-project cache. The ID and the namespaced name are
// copied as-is. The owner is the owner's email, or the owner's username when
// the email is empty; it stays empty when the project has no owner.
func gitlabProjectToCacheProject(proj *gitlab.Project) *project {
	cached := &project{id: proj.ID, name: proj.NameWithNamespace}

	owner := proj.Owner
	if owner == nil {
		return cached
	}

	// Prefer the email; fall back to the username when no email is set.
	if cached.owner = owner.Email; cached.owner == "" {
		cached.owner = owner.Username
	}
	return cached
}

func (s *Source) scanRepos(ctx context.Context, chunksChan chan *sources.Chunk) error {
// If there is resume information available, limit this scan to only the repos that still need scanning.
reposToScan, progressIndexOffset := sources.FilterReposToResume(s.repos, s.GetProgress().EncodedResumeInfo)
Expand Down Expand Up @@ -1126,5 +1208,39 @@ func (s *Source) ChunkUnit(ctx context.Context, unit sources.SourceUnit, reporte
}
}

// ensure project details are cached
// this is required to populate metadata during chunking
s.ensureProjectInCache(ctx, repoURL)
// remove project from cache to free up memory after chunking is done
defer s.repoToProjCache.del(repoURL)

return s.git.ScanRepo(ctx, repo, path, s.scanOptions, reporter)
}

// ensureProjectInCache makes sure the project details for repoUrl are in the
// repo-to-project cache, querying the GitLab API when they are not.
//
// The lookup is best effort: failures are logged and leave the cache
// untouched, so callers simply get chunk metadata without project details.
func (s *Source) ensureProjectInCache(ctx context.Context, repoUrl string) {
	// Nothing to do when the project is already cached.
	if _, ok := s.repoToProjCache.get(repoUrl); ok {
		return
	}

	// https://gitlab.com/testermctestface/testy.git => testermctestface/testy
	fullPath := projectPathFromRepoURL(repoUrl)
	if fullPath == "" {
		ctx.Logger().Info("could not derive project path from repo URL", "repo", repoUrl)
		return
	}

	apiClient, err := s.newClient()
	if err != nil {
		ctx.Logger().Error(err, "could not create api client")
		return
	}

	// Try the full namespaced path first so projects in nested subgroups
	// (group/subgroup/project) resolve to the right project. Fall back to the
	// last two segments, which is what is needed when the GitLab instance is
	// served under a URL sub-path and the leading segments are not namespaces.
	candidates := []string{fullPath}
	if short := lastTwoPathSegments(fullPath); short != fullPath {
		candidates = append(candidates, short)
	}

	var lastErr error
	for _, candidate := range candidates {
		proj, _, err := apiClient.Projects.GetProject(candidate, nil, gitlab.WithContext(ctx))
		if err != nil {
			lastErr = err
			continue
		}
		s.repoToProjCache.set(repoUrl, gitlabProjectToCacheProject(proj))
		return
	}

	ctx.Logger().Error(lastErr, "could not query project metadata", "repo", repoUrl)
}

// projectPathFromRepoURL returns the path component of an HTTP(S) repository
// URL without its leading and trailing slashes and without a ".git" suffix.
// It returns "" when the URL has no path after the host.
func projectPathFromRepoURL(repoURL string) string {
	rest := repoURL
	if _, afterScheme, ok := strings.Cut(rest, "://"); ok {
		rest = afterScheme
	}
	// Everything up to the first slash is the host (plus any userinfo/port).
	_, path, ok := strings.Cut(rest, "/")
	if !ok {
		return ""
	}
	return strings.TrimSuffix(strings.Trim(path, "/"), ".git")
}

// lastTwoPathSegments returns the final two "/"-separated segments of path,
// or path unchanged when it has two segments or fewer.
func lastTwoPathSegments(path string) string {
	segments := strings.Split(path, "/")
	if len(segments) <= 2 {
		return path
	}
	return strings.Join(segments[len(segments)-2:], "/")
}
133 changes: 133 additions & 0 deletions pkg/sources/gitlab/gitlab_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
"github.com/trufflesecurity/trufflehog/v3/pkg/sourcestest"
)

func TestSource_Scan(t *testing.T) {
Expand Down Expand Up @@ -680,3 +681,135 @@ func TestSource_InclusionGlobbing(t *testing.T) {
})
}
}

// TestSource_Chunks_ProjectDetailsInChunkMetadata checks that every chunk
// emitted by Source.Chunks carries a project ID and project name in its GitLab
// metadata, both without IncludeRepos and with a single repo in IncludeRepos.
// It is an integration test: it needs the GITLAB_TOKEN test secret.
func TestSource_Chunks_ProjectDetailsInChunkMetadata(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*30)
	defer cancel()

	secret, err := common.GetTestSecret(ctx)
	if err != nil {
		t.Fatal(fmt.Errorf("failed to access secret: %v", err))
	}

	token := secret.MustGetField("GITLAB_TOKEN")

	tests := []struct {
		name       string
		connection *sourcespb.GitLab
	}{
		{
			name: "project details in chunk metadata - No repos configured",
			connection: &sourcespb.GitLab{
				Credential: &sourcespb.GitLab_Token{
					Token: token,
				},
			},
		},
		{
			name: "project details in chunk metadata - Repo configured",
			connection: &sourcespb.GitLab{
				Credential: &sourcespb.GitLab_Token{
					Token: token,
				},
				IncludeRepos: []string{"https://gitlab.com/testermctestface/testy.git"},
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			s := Source{}

			conn, err := anypb.New(tt.connection)
			if err != nil {
				t.Fatal(err)
			}

			err = s.Init(ctx, tt.name, 0, 0, false, conn, 10)
			if err != nil {
				t.Errorf("Source.Init() error = %v", err)
				return
			}

			chunksCh := make(chan *sources.Chunk, 1)
			go func() {
				// Closing the channel ends the range loop below.
				defer close(chunksCh)
				// Keep the error local to the goroutine instead of writing to
				// the err variable captured from the enclosing function.
				if err := s.Chunks(context.Background(), chunksCh); err != nil {
					t.Errorf("Source.Chunks() error = %v", err)
				}
			}()

			gotChunks := false
			for gotChunk := range chunksCh {
				gotChunks = true
				// Report an unexpected metadata type as a failure rather than
				// panicking, and keep draining so the producer can finish.
				metadata, ok := gotChunk.SourceMetadata.Data.(*source_metadatapb.MetaData_Gitlab)
				if !ok {
					t.Errorf("Source.Chunks() unexpected chunk metadata type: %T", gotChunk.SourceMetadata.Data)
					continue
				}
				if metadata.Gitlab.ProjectId == 0 || metadata.Gitlab.ProjectName == "" {
					t.Errorf("Source.Chunks() missing project details in chunk metadata: %+v", metadata.Gitlab)
				}
			}
			if !gotChunks {
				t.Errorf("0 chunks scanned.")
			}
		})
	}
}

// TestSource_Enumerate_ProjectDetailsInChunkMetadata checks the
// Enumerate + ChunkUnit path: after enumeration the project cache is cleared,
// so ChunkUnit has to fetch project details itself, and every chunk must still
// carry a project ID and project name in its GitLab metadata.
// It is an integration test: it needs the GITLAB_TOKEN test secret.
func TestSource_Enumerate_ProjectDetailsInChunkMetadata(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*30)
	defer cancel()

	secret, err := common.GetTestSecret(ctx)
	if err != nil {
		t.Fatal(fmt.Errorf("failed to access secret: %v", err))
	}

	token := secret.MustGetField("GITLAB_TOKEN")

	s := Source{}

	conn, err := anypb.New(&sourcespb.GitLab{
		Credential: &sourcespb.GitLab_Token{
			Token: token,
		},
	})
	if err != nil {
		t.Fatal(err)
	}

	err = s.Init(ctx, "project details in chunkmetadata", 0, 0, false, conn, 10)
	if err != nil {
		t.Errorf("Source.Init() error = %v", err)
		return
	}

	testReporter := sourcestest.TestReporter{}
	err = s.Enumerate(ctx, &testReporter)
	if err != nil {
		// Name the call that actually failed.
		t.Errorf("Source.Enumerate() error = %v", err)
		return
	}

	chunksCh := make(chan *sources.Chunk, 1)
	chanReporter := sources.ChanReporter{Ch: chunksCh}
	// Clear cache to force querying project details
	s.repoToProjCache.clear()
	go func() {
		// Closing the channel ends the range loop below.
		defer close(chunksCh)
		for _, unit := range testReporter.Units {
			err := s.ChunkUnit(context.Background(), unit, chanReporter)
			if err != nil {
				t.Errorf("Source.ChunkUnit() error = %v", err)
			}
		}
	}()

	gotChunks := false
	for gotChunk := range chunksCh {
		gotChunks = true
		// Report an unexpected metadata type as a failure rather than
		// panicking, and keep draining so the producer can finish.
		metadata, ok := gotChunk.SourceMetadata.Data.(*source_metadatapb.MetaData_Gitlab)
		if !ok {
			t.Errorf("Source.ChunkUnit() unexpected chunk metadata type: %T", gotChunk.SourceMetadata.Data)
			continue
		}
		if metadata.Gitlab.ProjectId == 0 || metadata.Gitlab.ProjectName == "" {
			t.Errorf("Source.ChunkUnit() missing project details in chunk metadata: %+v", metadata.Gitlab)
		}
	}
	if !gotChunks {
		t.Errorf("0 chunks scanned.")
	}
}
Loading