If you want to rewrite the revision history of a git repository to remove secrets or useless big files, you may know the git filter-branch command. This command is hard to use and very slow. There are multiple alternatives to git filter-branch such as git-filter-repo or BFG Repo-Cleaner. But, as a .NET developer, I would like to use .NET to script the history rewrite.
Download .NET: https://dotnet.microsoft.com/download
Create a new project
Shell
# Create a new console project in the current directory
dotnet new console
Add the LibGit2Sharp NuGet package
Shell
# Add a LibGit2Sharp package reference to the project file
dotnet add package LibGit2Sharp
The csproj file should look like the following
csproj (MSBuild project file)
<Project Sdk="Microsoft.NET.Sdk">
<!-- Console executable; the C# code in this post relies on top-level statements and target-typed new -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net5.0</TargetFramework>
</PropertyGroup>
<!-- LibGit2Sharp provides the Repository, TreeDefinition and RewriteHistory APIs used by the C# code -->
<ItemGroup>
<PackageReference Include="LibGit2Sharp" Version="0.26.2" />
</ItemGroup>
</Project>
Configure the repository rewrite
C#
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using LibGit2Sharp;
// Location of the repository whose history is rewritten; the handle is disposed when the program ends
const string gitRepositoryPath = "path/to/git/repository";
using Repository repository = new(gitRepositoryPath);
// Callbacks and switches that drive the history rewrite
var options = new RewriteHistoryOptions
{
    PruneEmptyCommits = true,
    OnError = ex => Console.WriteLine(ex),
    OnSucceeding = () => Console.WriteLine("Succeeding"),

    // TODO Rename tags (currently every tag keeps its name)
    TagNameRewriter = (name, isAnnotated, target) => name,

    // TODO Update commit message or author
    CommitHeaderRewriter = commit =>
    {
        var header = CommitRewriteInfo.From(commit);
        header.Message = header.Message.Replace("old text", "new text");
        // header.Committer = ;
        // header.Author = ;
        return header;
    },

    // TODO Update the content of a commit
    CommitTreeRewriter = commit =>
    {
        var definition = TreeDefinition.From(commit);
        foreach (var blobEntry in GetTreeEntries(commit.Tree))
        {
            switch (blobEntry.Path)
            {
                // Remove a file
                case "to-remove.txt":
                    definition.Remove(blobEntry.Path);
                    break;

                // Rename a file: drop the old path and re-add the same entry under the new one
                case "to-rename.txt":
                    definition.Remove(blobEntry.Path);
                    definition.Add("new-path.txt", blobEntry);
                    break;

                // Edit the content of a file
                case "to-edit.txt":
                    RewriteTextContent(definition, blobEntry,
                        text => text.Replace("my-old-text", "new-text"));
                    break;
            }
        }

        return definition;
    },
};
// Collect the commits to rewrite: everything reachable from HEAD,
// sorted with the Topological | Reverse strategies
var commitFilter = new CommitFilter
{
    IncludeReachableFrom = repository.Head.Tip.Sha,
    FirstParentOnly = false,
    SortBy = CommitSortStrategies.Topological | CommitSortStrategies.Reverse,
    // You can exclude some commits from the rewrite if needed
    // ExcludeReachableFrom = new object[]
    // {
    //     "v1.2.0", // Tag name
    //     "old_branch", // branch name
    //     "9f42adbf542f9de2e840b82bce01e0e5801881fc", // Commit sha
    // },
};
var commitsToRewrite = repository.Commits.QueryBy(commitFilter).ToArray();
// It seems libgit2sharp cannot rewrite tags when remotes are defined,
// so the remote is removed before rewriting.
// NOTE(review): only the remote named "origin" is removed here; if the repository
// has other remotes they are left in place - confirm whether they must be removed too.
repository.Network.Remotes.Remove("origin");
// Rewrite history: apply the callbacks configured in `options` to the selected commits
repository.Refs.RewriteHistory(options, commitsToRewrite);
Console.WriteLine("Completed");
// ------------------------------------------
// Helper methods
// Walk a tree depth-first and return every blob entry it contains, including those in sub-trees.
// The result is fully materialized before being returned.
static IEnumerable<TreeEntry> GetTreeEntries(Tree tree)
{
    var blobEntries = new List<TreeEntry>();
    Collect(tree, blobEntries);
    return blobEntries;

    // Append the blob entries of `current` to `sink`, descending into sub-trees in place
    static void Collect(Tree current, List<TreeEntry> sink)
    {
        foreach (var child in current)
        {
            switch (child.TargetType)
            {
                case TreeEntryTargetType.Blob:
                    sink.Add(child);
                    break;

                case TreeEntryTargetType.Tree:
                    Collect((Tree)child.Target, sink);
                    break;
            }
        }
    }
}
// Replace the content of a blob entry in `treeDefinition` with the bytes produced by `rewriter`.
//   treeDefinition: the tree being built for the rewritten commit
//   entry:          the entry to rewrite; entries that are not blobs are ignored
//   rewriter:       receives the original bytes and returns the new bytes
// Nothing is changed when the rewriter returns the same bytes (same array or identical content).
void RewriteContent(TreeDefinition treeDefinition,
    TreeEntry entry,
    Func<byte[], byte[]> rewriter)
{
    if (entry.TargetType != TreeEntryTargetType.Blob)
    {
        return;
    }

    var blob = (Blob)entry.Target;
    using var blobStream = blob.GetContentStream();
    using var readStream = new MemoryStream();
    blobStream.CopyTo(readStream);
    var originalContent = readStream.ToArray();

    var newContent = rewriter(originalContent);

    // Arrays compare by reference with ==, so compare the content as well:
    // a rewriter returning an identical copy must not trigger a blob re-creation.
    if (object.ReferenceEquals(originalContent, newContent)
        || originalContent.AsSpan().SequenceEqual(newContent))
    {
        return;
    }

    // remove the current entry and add the new one, keeping the original file mode
    treeDefinition.Remove(entry.Path);
    using MemoryStream newContentStream = new(newContent);
    Blob newBlob = repository.ObjectDatabase.CreateBlob(newContentStream);
    treeDefinition.Add(entry.Path, newBlob, entry.Mode);
}
// Rewrite a blob entry as text.
//   treeDefinition: the tree being built for the rewritten commit
//   entry:          the entry to rewrite (forwarded to RewriteContent)
//   rewriter:       receives the decoded text and returns the new text
// The text is decoded using the byte order mark when one is present (UTF-8 otherwise) and
// re-encoded with the same encoding, keeping the byte order mark, so that editing a file
// does not silently change its encoding.
void RewriteTextContent(TreeDefinition treeDefinition,
    TreeEntry entry,
    Func<string, string> rewriter)
{
    RewriteContent(treeDefinition, entry, content =>
    {
        using var ms = new MemoryStream(content);
        using var reader = new StreamReader(ms, Encoding.UTF8, detectEncodingFromByteOrderMarks: true);
        string originalContent = reader.ReadToEnd();

        // CurrentEncoding reflects the detected byte order mark only after the first read
        Encoding encoding = reader.CurrentEncoding;

        var newContent = rewriter(originalContent);
        if (newContent == originalContent)
        {
            // Return the same array so the caller sees "no change"
            return content;
        }

        byte[] encoded = encoding.GetBytes(newContent);

        // GetBytes never emits a byte order mark; restore it only if the original had one
        byte[] preamble = encoding.GetPreamble();
        bool hadByteOrderMark = preamble.Length > 0 && content.AsSpan().StartsWith(preamble);
        if (!hadByteOrderMark)
        {
            return encoded;
        }

        var result = new byte[preamble.Length + encoded.Length];
        preamble.CopyTo(result, 0);
        encoded.CopyTo(result, preamble.Length);
        return result;
    });
}
You can now run the project. This operation can take a few minutes depending on the repository size and the list of rewrite operations.
Shell
# Build and run the project to perform the rewrite
dotnet run
The history should be rewritten!
#Searching text in the repository history
After rewriting the history, you may want to validate its content by using tools such as grep to search for specific strings. You'll find some tricky command lines on Stack Overflow. A way I found easier is to output all blobs to a directory. Then, you can use the tools you want on these files to find what you are looking for. It can also be useful to find huge files in the repository.
C#
using System.Collections.Generic;
using System.IO;
using LibGit2Sharp;
// Write every blob of the object database to the output directory, one file per blob named after its SHA
var gitRepositoryPath = @"path/to/git/repository";
var outputDirectory = @"path/to/blobs";

using var repository = new Repository(gitRepositoryPath);
Directory.CreateDirectory(outputDirectory);

foreach (var gitObject in repository.ObjectDatabase)
{
    // Only blobs carry file content; skip everything else
    if (gitObject is not Blob blob)
    {
        continue;
    }

    var destinationPath = Path.Combine(outputDirectory, blob.Sha);
    using var destination = File.OpenWrite(destinationPath);
    using var source = blob.GetContentStream();
    source.CopyTo(destination);
}
If you want to get only the blobs referenced by a branch or a tag, you can modify the previous code:
C#
using System.Collections.Generic;
using System.IO;
using LibGit2Sharp;
// Write every blob reachable from the "main" branch to the output directory,
// one file per blob named after its SHA.
var gitRepositoryPath = @"path/to/git/repository";
using var repository = new Repository(gitRepositoryPath);
var outputDirectory = @"path/to/blobs";
Directory.CreateDirectory(outputDirectory);

// Fail with a clear message instead of a NullReferenceException when the branch is missing
var branch = repository.Branches["main"]
    ?? throw new System.InvalidOperationException("Branch 'main' was not found in the repository.");

var blobs = new HashSet<Blob>();
// Consecutive commits share most of their sub-trees; remembering the tree ids
// already walked avoids re-walking the whole tree for every commit.
var visitedTrees = new HashSet<ObjectId>();
foreach (var commit in branch.Commits)
{
    GetBlobs(blobs, commit.Tree, visitedTrees);
}

foreach (var blob in blobs)
{
    var outputBlobPath = Path.Combine(outputDirectory, blob.Sha);
    using var outputStream = File.OpenWrite(outputBlobPath);
    using var blobStream = blob.GetContentStream();
    blobStream.CopyTo(outputStream);
}

// Recursively add every blob found in `tree` (including sub-trees) to `blobs`.
//   blobs:        the set receiving the blobs
//   tree:         the tree to walk
//   visitedTrees: optional set of tree ids already walked; when provided, a tree whose id
//                 is already in the set is skipped and newly walked trees are recorded in it
static void GetBlobs(ISet<Blob> blobs, Tree tree, ISet<ObjectId> visitedTrees = null)
{
    // A tree id identifies its content, so a tree seen before cannot yield new blobs
    if (visitedTrees != null && !visitedTrees.Add(tree.Id))
    {
        return;
    }

    foreach (var entry in tree)
    {
        if (entry.TargetType == TreeEntryTargetType.Blob)
        {
            blobs.Add((Blob)entry.Target);
        }
        else if (entry.TargetType == TreeEntryTargetType.Tree)
        {
            GetBlobs(blobs, (Tree)entry.Target, visitedTrees);
        }
    }
}
After running the previous code, you can use tools such as grep to find text in the revision history.
Shell
# Export the blobs, then search the exported files recursively
dotnet run
grep -R "text" path/to/blobs
#Additional resources
Do you have a question or a suggestion about this post? Contact me!