Compare commits

...

2 Commits

Author SHA1 Message Date
Farid Zakaria
a67c93c240 Improve Git URI handling
Git URI can also support scp style links similar to git itself.

This change augments the function fixGitURL to better handle the scp
style urls through a minimal parser rather than regex which has been
found to be brittle.

* Support for IPV6 added
* New test cases added for fixGitURL
* Clearer documentation on purpose and goal of function
* More `std::string_view` for performance
* Update URL tests

Fixes #5958

Mostly undoes revert 4757487110599bbe9a287ead75741bba5436d52f
Adapted from commit 04ad66af5f
2025-09-01 17:20:40 -04:00
John Ericson
c80805cb61 Make fixGitURL reject file:/foo/bar/ URL
This is a breaking change, but I think a good one. If we had allowed it,
Git/SCP and the rest of our code would have disagreed on what this URL
meant.
2025-09-01 17:20:27 -04:00
3 changed files with 148 additions and 73 deletions

View File

@@ -14,8 +14,8 @@ using HostType = Authority::HostType;
struct FixGitURLParam
{
std::string input;
std::string expected;
std::string_view input;
std::string_view expected;
ParsedURL parsed;
};
@@ -63,6 +63,34 @@ INSTANTIATE_TEST_SUITE_P(
.path = {"", "owner", "repo.git"},
},
},
// SCP-like URL (no user)
FixGitURLParam{
.input = "github.com:owner/repo.git",
.expected = "ssh://github.com/owner/repo.git",
.parsed =
ParsedURL{
.scheme = "ssh",
.authority =
ParsedURL::Authority{
.host = "github.com",
},
.path = {"", "owner", "repo.git"},
},
},
// SCP-like URL (leading slash)
FixGitURLParam{
.input = "github.com:/owner/repo.git",
.expected = "ssh://github.com/owner/repo.git",
.parsed =
ParsedURL{
.scheme = "ssh",
.authority =
ParsedURL::Authority{
.host = "github.com",
},
.path = {"", "owner", "repo.git"},
},
},
// Absolute path (becomes file:)
FixGitURLParam{
.input = "/home/me/repo",
@@ -74,23 +102,10 @@ INSTANTIATE_TEST_SUITE_P(
.path = {"", "home", "me", "repo"},
},
},
// Already file: scheme
// NOTE: Git/SCP treat this as a `<hostname>:<path>`, so we are
// failing to "fix up" this case.
FixGitURLParam{
.input = "file:/var/repos/x",
.expected = "file:/var/repos/x",
.parsed =
ParsedURL{
.scheme = "file",
.authority = std::nullopt,
.path = {"", "var", "repos", "x"},
},
},
// IPV6 test case
FixGitURLParam{
.input = "user@[2001:db8:1::2]:/home/file",
.expected = "ssh://user@[2001:db8:1::2]//home/file",
.expected = "ssh://user@[2001:db8:1::2]/home/file",
.parsed =
ParsedURL{
.scheme = "ssh",
@@ -100,7 +115,7 @@ INSTANTIATE_TEST_SUITE_P(
.host = "2001:db8:1::2",
.user = "user",
},
.path = {"", "", "home", "file"},
.path = {"", "home", "file"},
},
}));
@@ -112,19 +127,18 @@ TEST_P(FixGitURLTestSuite, parsesVariedGitUrls)
EXPECT_EQ(actual.to_string(), p.expected);
}
TEST(FixGitURLTestSuite, scpLikeNoUserParsesPoorly)
TEST_P(FixGitURLTestSuite, fixGitIsIdempotent)
{
// SCP-like URL (no user)
auto & p = GetParam();
const auto actual = fixGitURL(p.expected).to_string();
EXPECT_EQ(actual, p.expected);
}
// Cannot "to_string" this because has illegal path not starting
// with `/`.
EXPECT_EQ(
fixGitURL("github.com:owner/repo.git"),
(ParsedURL{
.scheme = "file",
.authority = ParsedURL::Authority{},
.path = {"github.com:owner", "repo.git"},
}));
TEST_P(FixGitURLTestSuite, fixGitOutputParses)
{
auto & p = GetParam();
const auto parsed = fixGitURL(p.expected);
EXPECT_EQ(parseURL(parsed.to_string()), parsed);
}
TEST(FixGitURLTestSuite, properlyRejectFileURLWithAuthority)
@@ -136,37 +150,15 @@ TEST(FixGitURLTestSuite, properlyRejectFileURLWithAuthority)
testing::HasSubstrIgnoreANSIMatcher("file:// URL 'file://var/repos/x' has unexpected authority 'var'")));
}
TEST(FixGitURLTestSuite, scpLikePathLeadingSlashParsesPoorly)
TEST(FixGitURLTestSuite, ambiguousScpLikeOrFileURL)
{
// SCP-like URL (no user)
// Cannot "to_string" this because has illegal path not starting
// with `/`.
EXPECT_EQ(
fixGitURL("github.com:/owner/repo.git"),
(ParsedURL{
.scheme = "file",
.authority = ParsedURL::Authority{},
.path = {"github.com:", "owner", "repo.git"},
}));
}
TEST(FixGitURLTestSuite, relativePathParsesPoorly)
{
// Relative path (becomes file:// absolute)
// Cannot "to_string" this because has illegal path not starting
// with `/`.
EXPECT_EQ(
fixGitURL("relative/repo"),
(ParsedURL{
.scheme = "file",
.authority =
ParsedURL::Authority{
.hostType = ParsedURL::Authority::HostType::Name,
.host = "",
},
.path = {"relative", "repo"}}));
/* Git/SCP treat this as a `<hostname>:<path>`, but under IETF RFC
8089 it is a valid (if sloppy) file URL. Rather than decide who
is right, we just make it an error. */
EXPECT_THAT(
[]() { fixGitURL("file:/var/repos/x"); },
::testing::ThrowsMessage<BadURL>(testing::HasSubstrIgnoreANSIMatcher(
"URL 'file:/var/repos/x' would parse as SCP authority = 'file', path = '/var/repos/x' but this is also a valid `file:..` URL, and so we choose to disallow it")));
}
TEST(parseURL, parsesSimpleHttpUrl)

View File

@@ -327,10 +327,23 @@ struct ParsedUrlScheme
ParsedUrlScheme parseUrlScheme(std::string_view scheme);
/* Detects scp-style uris (e.g. git@github.com:NixOS/nix) and fixes
them by removing the `:` and assuming a scheme of `ssh://`. Also
changes absolute paths into file:// URLs. */
ParsedURL fixGitURL(const std::string & url);
/**
* Normalize a Git remote string from various styles into a URL-like form.
* Input forms handled:
* 1) SCP-style SSH syntax: "[user@]host:path" -> "ssh://user@host/path"
* 2) Already "file:" URLs: "file:/abs/or/rel" -> unchanged
* 3) Bare paths / filenames: "src/repo" or "/abs" -> "file:src/repo" or "file:/abs"
* 4) Anything with "://": treated as a proper URL -> unchanged
*
* Note: for the scp-style, as they are converted to ssh-form, all paths are assumed to
* then be absolute whereas in programs like git, they retain the scp form which allows
* relative paths.
*
* Additionally, if no url can be determined, it is returned as a file:// URI.
* If the url does not start with a leading slash, one will be added since there are no
* relative path URIs.
*/
ParsedURL fixGitURL(std::string_view url);
/**
* Whether a string is valid as RFC 3986 scheme name.

View File

@@ -408,21 +408,91 @@ ParsedUrlScheme parseUrlScheme(std::string_view scheme)
};
}
ParsedURL fixGitURL(const std::string & url)
struct ScpLike
{
std::regex scpRegex("([^/]*)@(.*):(.*)");
if (!hasPrefix(url, "/") && std::regex_match(url, scpRegex))
return parseURL(std::regex_replace(url, scpRegex, "ssh://$1@$2/$3"));
if (hasPrefix(url, "file:"))
return parseURL(url);
if (url.find("://") == std::string::npos) {
ParsedURL::Authority authority;
std::string_view path;
};
/**
* Parse a scp url. This is a helper struct for fixGitURL.
* This is needed since we support scp-style urls for git urls.
* https://git-scm.com/book/ms/v2/Git-on-the-Server-The-Protocols
*
* A good reference is libgit2 also allows scp style
* https://github.com/libgit2/libgit2/blob/58d9363f02f1fa39e46d49b604f27008e75b72f2/src/util/net.c#L806
*/
static std::optional<ScpLike> parseScp(const std::string_view s) noexcept
{
if (s.empty() || s.front() == '/')
return std::nullopt;
// Find the colon that separates host from path.
// Find the right-most since ipv6 has colons
const auto colon = s.rfind(':');
if (colon == std::string_view::npos)
return std::nullopt;
// Split head:[path]
const auto head = s.substr(0, colon);
const auto path = s.substr(colon + 1);
if (head.empty())
return std::nullopt;
return ScpLike{
.authority = ParsedURL::Authority::parse(head),
.path = path,
};
}
ParsedURL fixGitURL(const std::string_view url)
{
{
std::optional<ParsedURL> parsedOpt;
try {
parsedOpt = parseURL(url);
} catch (BadURL &) {
if (hasPrefix(url, "file:"))
throw;
}
if (parsedOpt) {
auto & parsed = *parsedOpt;
if (parsed.authority)
return parsed;
if (parsed.scheme == "file")
throw BadURL(
"URL '%s' would parse as SCP authority = 'file', path = '%s' but this is also a valid `file:..` URL, and so we choose to disallow it",
url,
parsed.renderPath(true));
}
}
// if the url does not start with forward slash, add one
auto splitMakeAbs = [&](std::string_view pathS) {
std::vector<std::string> path;
if (!hasPrefix(pathS, "/")) {
path.emplace_back("");
}
splitStringInto(path, pathS, "/");
return path;
};
if (auto scp = parseScp(url)) {
return ParsedURL{
.scheme = "file",
.authority = ParsedURL::Authority{},
.path = splitString<std::vector<std::string>>(url, "/"),
.scheme = "ssh",
.authority = std::move(scp->authority),
.path = splitMakeAbs(scp->path),
};
}
return parseURL(url);
return ParsedURL{
.scheme = "file",
.authority = ParsedURL::Authority{},
.path = splitMakeAbs(url),
};
}
// https://www.rfc-editor.org/rfc/rfc3986#section-3.1