master
  1package gitdiff
  2
  3import (
  4	"bufio"
  5	"errors"
  6	"fmt"
  7	"io"
  8	"io/ioutil"
  9	"mime/quotedprintable"
 10	"net/mail"
 11	"strconv"
 12	"strings"
 13	"time"
 14	"unicode"
 15)
 16
 17const (
 18	mailHeaderPrefix        = "From "
 19	prettyHeaderPrefix      = "commit "
 20	mailMinimumHeaderPrefix = "From:"
 21)
 22
 23// PatchHeader is a parsed version of the preamble content that appears before
 24// the first diff in a patch. It includes metadata about the patch, such as the
 25// author and a subject.
 26type PatchHeader struct {
 27	// The SHA of the commit the patch was generated from. Empty if the SHA is
 28	// not included in the header.
 29	SHA string
 30
 31	// The author details of the patch. If these details are not included in
 32	// the header, Author is nil and AuthorDate is the zero time.
 33	Author     *PatchIdentity
 34	AuthorDate time.Time
 35
 36	// The committer details of the patch. If these details are not included in
 37	// the header, Committer is nil and CommitterDate is the zero time.
 38	Committer     *PatchIdentity
 39	CommitterDate time.Time
 40
 41	// The title and body of the commit message describing the changes in the
 42	// patch. Empty if no message is included in the header.
 43	Title string
 44	Body  string
 45
 46	// If the preamble looks like an email, ParsePatchHeader will
 47	// remove prefixes such as `Re: ` and `[PATCH v3 5/17]` from the
 48	// Title and place them here.
 49	SubjectPrefix string
 50
 51	// If the preamble looks like an email, and it contains a `---`
 52	// line, that line will be removed and everything after it will be
 53	// placed in BodyAppendix.
 54	BodyAppendix string
 55}
 56
 57// Message returns the commit message for the header. The message consists of
 58// the title and the body separated by an empty line.
 59func (h *PatchHeader) Message() string {
 60	var msg strings.Builder
 61	if h != nil {
 62		msg.WriteString(h.Title)
 63		if h.Body != "" {
 64			msg.WriteString("\n\n")
 65			msg.WriteString(h.Body)
 66		}
 67	}
 68	return msg.String()
 69}
 70
 71// ParsePatchDate parses a patch date string. It returns the parsed time or an
 72// error if s has an unknown format. ParsePatchDate supports the iso, rfc,
 73// short, raw, unix, and default formats (with local variants) used by the
 74// --date flag in Git.
 75func ParsePatchDate(s string) (time.Time, error) {
 76	const (
 77		isoFormat          = "2006-01-02 15:04:05 -0700"
 78		isoStrictFormat    = "2006-01-02T15:04:05-07:00"
 79		rfc2822Format      = "Mon, 2 Jan 2006 15:04:05 -0700"
 80		shortFormat        = "2006-01-02"
 81		defaultFormat      = "Mon Jan 2 15:04:05 2006 -0700"
 82		defaultLocalFormat = "Mon Jan 2 15:04:05 2006"
 83	)
 84
 85	if s == "" {
 86		return time.Time{}, nil
 87	}
 88
 89	for _, fmt := range []string{
 90		isoFormat,
 91		isoStrictFormat,
 92		rfc2822Format,
 93		shortFormat,
 94		defaultFormat,
 95		defaultLocalFormat,
 96	} {
 97		if t, err := time.ParseInLocation(fmt, s, time.Local); err == nil {
 98			return t, nil
 99		}
100	}
101
102	// unix format
103	if unix, err := strconv.ParseInt(s, 10, 64); err == nil {
104		return time.Unix(unix, 0), nil
105	}
106
107	// raw format
108	if space := strings.IndexByte(s, ' '); space > 0 {
109		unix, uerr := strconv.ParseInt(s[:space], 10, 64)
110		zone, zerr := time.Parse("-0700", s[space+1:])
111		if uerr == nil && zerr == nil {
112			return time.Unix(unix, 0).In(zone.Location()), nil
113		}
114	}
115
116	return time.Time{}, fmt.Errorf("unknown date format: %s", s)
117}
118
// patchHeaderOptions collects the settings that PatchHeaderOption values can
// change.
type patchHeaderOptions struct {
	subjectCleanMode SubjectCleanMode
}

// A PatchHeaderOption modifies the behavior of ParsePatchHeader.
type PatchHeaderOption func(*patchHeaderOptions)

// SubjectCleanMode controls how ParsePatchHeader cleans subject lines when
// parsing mail-formatted patches.
type SubjectCleanMode int

const (
	// SubjectCleanWhitespace removes leading and trailing whitespace only.
	SubjectCleanWhitespace SubjectCleanMode = iota

	// SubjectCleanAll removes leading and trailing whitespace, leading "Re:",
	// "re:", and ":" strings, and leading strings enclosed by '[' and ']'.
	// This is the default behavior of git (see `git mailinfo`) and of this
	// package.
	SubjectCleanAll

	// SubjectCleanPatchOnly is the same as SubjectCleanAll, except that a
	// leading string enclosed by '[' and ']' is only removed if it contains
	// "PATCH".
	SubjectCleanPatchOnly
)

// WithSubjectCleanMode returns an option that sets the SubjectCleanMode used
// for header parsing. Without it, parsing uses SubjectCleanAll.
func WithSubjectCleanMode(m SubjectCleanMode) PatchHeaderOption {
	return func(o *patchHeaderOptions) { o.subjectCleanMode = m }
}
152
153// ParsePatchHeader parses the preamble string returned by [Parse] into a
154// PatchHeader. Due to the variety of header formats, some fields of the parsed
155// PatchHeader may be unset after parsing.
156//
157// Supported formats are the short, medium, full, fuller, and email pretty
158// formats used by `git diff`, `git log`, and `git show` and the UNIX mailbox
159// format used by `git format-patch`.
160//
161// When parsing mail-formatted headers, ParsePatchHeader tries to remove
162// email-specific content from the title and body:
163//
164//   - Based on the SubjectCleanMode, remove prefixes like reply markers and
165//     "[PATCH]" strings from the subject, saving any removed content in the
166//     SubjectPrefix field. Parsing always discards leading and trailing
167//     whitespace from the subject line. The default mode is SubjectCleanAll.
168//
169//   - If the body contains a "---" line (3 hyphens), remove that line and any
170//     content after it from the body and save it in the BodyAppendix field.
171//
172// ParsePatchHeader tries to process content it does not understand wthout
173// returning errors, but will return errors if well-identified content like
174// dates or identies uses unknown or invalid formats.
175func ParsePatchHeader(header string, options ...PatchHeaderOption) (*PatchHeader, error) {
176	opts := patchHeaderOptions{
177		subjectCleanMode: SubjectCleanAll, // match git defaults
178	}
179	for _, optFn := range options {
180		optFn(&opts)
181	}
182
183	header = strings.TrimSpace(header)
184	if header == "" {
185		return &PatchHeader{}, nil
186	}
187
188	var firstLine, rest string
189	if idx := strings.IndexByte(header, '\n'); idx >= 0 {
190		firstLine = header[:idx]
191		rest = header[idx+1:]
192	} else {
193		firstLine = header
194		rest = ""
195	}
196
197	switch {
198	case strings.HasPrefix(firstLine, mailHeaderPrefix):
199		return parseHeaderMail(firstLine, strings.NewReader(rest), opts)
200
201	case strings.HasPrefix(firstLine, mailMinimumHeaderPrefix):
202		// With a minimum header, the first line is part of the actual mail
203		// content and needs to be parsed as part of the "rest"
204		return parseHeaderMail("", strings.NewReader(header), opts)
205
206	case strings.HasPrefix(firstLine, prettyHeaderPrefix):
207		return parseHeaderPretty(firstLine, strings.NewReader(rest))
208	}
209
210	return nil, errors.New("unrecognized patch header format")
211}
212
213func parseHeaderPretty(prettyLine string, r io.Reader) (*PatchHeader, error) {
214	const (
215		authorPrefix     = "Author:"
216		commitPrefix     = "Commit:"
217		datePrefix       = "Date:"
218		authorDatePrefix = "AuthorDate:"
219		commitDatePrefix = "CommitDate:"
220	)
221
222	h := &PatchHeader{}
223
224	prettyLine = strings.TrimPrefix(prettyLine, prettyHeaderPrefix)
225	if i := strings.IndexByte(prettyLine, ' '); i > 0 {
226		h.SHA = prettyLine[:i]
227	} else {
228		h.SHA = prettyLine
229	}
230
231	s := bufio.NewScanner(r)
232	for s.Scan() {
233		line := s.Text()
234
235		// empty line marks end of fields, remaining lines are title/message
236		if strings.TrimSpace(line) == "" {
237			break
238		}
239
240		switch {
241		case strings.HasPrefix(line, authorPrefix):
242			u, err := ParsePatchIdentity(line[len(authorPrefix):])
243			if err != nil {
244				return nil, err
245			}
246			h.Author = &u
247
248		case strings.HasPrefix(line, commitPrefix):
249			u, err := ParsePatchIdentity(line[len(commitPrefix):])
250			if err != nil {
251				return nil, err
252			}
253			h.Committer = &u
254
255		case strings.HasPrefix(line, datePrefix):
256			d, err := ParsePatchDate(strings.TrimSpace(line[len(datePrefix):]))
257			if err != nil {
258				return nil, err
259			}
260			h.AuthorDate = d
261
262		case strings.HasPrefix(line, authorDatePrefix):
263			d, err := ParsePatchDate(strings.TrimSpace(line[len(authorDatePrefix):]))
264			if err != nil {
265				return nil, err
266			}
267			h.AuthorDate = d
268
269		case strings.HasPrefix(line, commitDatePrefix):
270			d, err := ParsePatchDate(strings.TrimSpace(line[len(commitDatePrefix):]))
271			if err != nil {
272				return nil, err
273			}
274			h.CommitterDate = d
275		}
276	}
277	if s.Err() != nil {
278		return nil, s.Err()
279	}
280
281	title, indent := scanMessageTitle(s)
282	if s.Err() != nil {
283		return nil, s.Err()
284	}
285	h.Title = title
286
287	if title != "" {
288		// Don't check for an appendix, pretty headers do not contain them
289		body, _ := scanMessageBody(s, indent, false)
290		if s.Err() != nil {
291			return nil, s.Err()
292		}
293		h.Body = body
294	}
295
296	return h, nil
297}
298
// scanMessageTitle reads lines from s up to and including the first blank
// line (or the end of input) and returns them as the title, with each line
// trimmed of surrounding whitespace and the lines joined by single spaces.
//
// indent is the leading whitespace of the first title line, or empty if that
// line is not indented. The caller is responsible for checking s.Err().
func scanMessageTitle(s *bufio.Scanner) (title string, indent string) {
	var parts []string
	for s.Scan() {
		raw := s.Text()
		text := strings.TrimSpace(raw)
		if text == "" {
			break
		}

		if len(parts) == 0 {
			// Only the first line determines the indent; it is whatever
			// precedes the first non-space character.
			unindented := strings.TrimLeftFunc(raw, unicode.IsSpace)
			indent = raw[:len(raw)-len(unindented)]
		}
		parts = append(parts, text)
	}
	return strings.Join(parts, " "), indent
}
320
// scanMessageBody reads the remaining lines from s and returns the message
// body and, if requested, the appendix.
//
// Each line has trailing whitespace removed and then indent stripped from
// its start. Blank lines at the start of a section are dropped and any run of
// blank lines between two text lines becomes a single empty line. When
// separateAppendix is true, the first line equal to "---" is discarded and
// every later line goes to the appendix instead of the body; otherwise the
// appendix is always empty. The caller is responsible for checking s.Err().
func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (string, string) {
	var body, appendix strings.Builder

	out := &body        // section currently being written
	inAppendix := false // set once the "---" separator has been seen
	sawBlank := false   // a blank line was read since the last text line

	for s.Scan() {
		text := strings.TrimPrefix(strings.TrimRightFunc(s.Text(), unicode.IsSpace), indent)

		switch {
		case text == "":
			sawBlank = true

		case separateAppendix && !inAppendix && text == "---":
			// If requested, split off "appendix" information (often added
			// by `git format-patch` and removed by `git am`).
			inAppendix = true
			out = &appendix

		default:
			if out.Len() > 0 {
				if sawBlank {
					out.WriteString("\n\n")
				} else {
					out.WriteByte('\n')
				}
			}
			sawBlank = false
			out.WriteString(text)
		}
	}
	return body.String(), appendix.String()
}
356
357func parseHeaderMail(mailLine string, r io.Reader, opts patchHeaderOptions) (*PatchHeader, error) {
358	msg, err := mail.ReadMessage(r)
359	if err != nil {
360		return nil, err
361	}
362
363	h := &PatchHeader{}
364
365	if strings.HasPrefix(mailLine, mailHeaderPrefix) {
366		mailLine = strings.TrimPrefix(mailLine, mailHeaderPrefix)
367		if i := strings.IndexByte(mailLine, ' '); i > 0 {
368			h.SHA = mailLine[:i]
369		}
370	}
371
372	from := msg.Header.Get("From")
373	if from != "" {
374		u, err := ParsePatchIdentity(from)
375		if err != nil {
376			return nil, err
377		}
378		h.Author = &u
379	}
380
381	date := msg.Header.Get("Date")
382	if date != "" {
383		d, err := ParsePatchDate(date)
384		if err != nil {
385			return nil, err
386		}
387		h.AuthorDate = d
388	}
389
390	subject := msg.Header.Get("Subject")
391	h.SubjectPrefix, h.Title = cleanSubject(subject, opts.subjectCleanMode)
392
393	s := bufio.NewScanner(msg.Body)
394	h.Body, h.BodyAppendix = scanMessageBody(s, "", true)
395	if s.Err() != nil {
396		return nil, s.Err()
397	}
398
399	return h, nil
400}
401
402func cleanSubject(s string, mode SubjectCleanMode) (prefix string, subject string) {
403	switch mode {
404	case SubjectCleanAll, SubjectCleanPatchOnly:
405	case SubjectCleanWhitespace:
406		return "", strings.TrimSpace(decodeSubject(s))
407	default:
408		panic(fmt.Sprintf("unknown clean mode: %d", mode))
409	}
410
411	// Based on the algorithm from Git in mailinfo.c:cleanup_subject()
412	// If compatibility with `git am` drifts, go there to see if there are any updates.
413
414	at := 0
415	for at < len(s) {
416		switch s[at] {
417		case 'r', 'R':
418			// Detect re:, Re:, rE: and RE:
419			if at+2 < len(s) && (s[at+1] == 'e' || s[at+1] == 'E') && s[at+2] == ':' {
420				at += 3
421				continue
422			}
423
424		case ' ', '\t', ':':
425			// Delete whitespace and duplicate ':' characters
426			at++
427			continue
428
429		case '[':
430			if i := strings.IndexByte(s[at:], ']'); i > 0 {
431				if mode == SubjectCleanAll || strings.Contains(s[at:at+i+1], "PATCH") {
432					at += i + 1
433					continue
434				}
435			}
436		}
437
438		// Nothing was removed, end processing
439		break
440	}
441
442	prefix = strings.TrimLeftFunc(s[:at], unicode.IsSpace)
443	subject = strings.TrimRightFunc(decodeSubject(s[at:]), unicode.IsSpace)
444	return
445}
446
// decodeSubject decodes a subject line. Currently it only supports
// quoted-printable UTF-8, which is what `git format-patch` produces when the
// commit title has a non-ASCII character (i.e. an emoji). A subject that does
// not start with the "=?UTF-8?q?" marker, or whose payload is not valid
// quoted-printable, is returned unchanged.
// See for reference: https://stackoverflow.com/questions/27695749/gmail-api-not-respecting-utf-encoding-in-subject
func decodeSubject(encoded string) string {
	const wordStart, wordEnd = "=?UTF-8?q?", "?="

	if !strings.HasPrefix(encoded, wordStart) {
		// not UTF-8 encoded
		return encoded
	}

	// If the subject is too long, `git format-patch` may produce a subject line across
	// multiple lines. When parsed, this can look like the following:
	// <UTF8-prefix><first-line> <UTF8-prefix><second-line>
	// Prepending a space lets the first marker be removed the same way as
	// the later, space-separated ones. The start markers must be removed
	// before the end markers.
	payload := strings.ReplaceAll(" "+encoded, " "+wordStart, "")
	payload = strings.ReplaceAll(payload, wordEnd, "")

	decoded, err := ioutil.ReadAll(quotedprintable.NewReader(strings.NewReader(payload)))
	if err != nil {
		// if err, abort decoding and return original subject
		return encoded
	}
	return string(decoded)
}
470}