program modifiedutf8tests;

{$mode objfpc}{$H+}

uses
  {$IFDEF UNIX}{$IFDEF UseCThreads}
  cthreads,
  {$ENDIF}{$ENDIF}
  Classes , sysutils, strutils, bbjniutils, LCLProc, bbutils
  { you can add units after this };

procedure test(i,j: integer);
begin
  if i <> j then raise Exception.Create('test failed: '+inttostr(i)+ ' <> '+inttostr(j));
end;

procedure dump(s: string);
var
  i: Integer;
begin
  write(length(s), ' ');
  for i := 1 to length(s) do write(inttohex(ord(s[i]), 2));
  writeln;
end;

procedure test(s, t: string);
begin
  if s <> t then begin
    dump(s);
    dump(t);
    raise Exception.Create('test failed: '+s+ ' <> '+t);
  end;
end;

function isValidUTF8(s: string; debug: boolean): boolean;
var
  c, cl: integer;
  i: Integer;
  t: String;

begin
  t := StringReplace(s, #$C0#$80, #0#0, [rfReplaceAll]);
  if FindInvalidUTF8Character(@t[1], length(t)) <> -1 then exit(false); //lcl function to check for validation. Does not work in a few cases through.
  i := 1;
  t := '';
  while i <= length(s) do begin
    c := UTF8CharacterToUnicode(@s[i], cl);
    t += strGetUnicodeCharacter(c);
    i += cl;
    if debug then write(c, '  ');
  end;
  result := s = t;
  if not result and ((pos(#0, t) > 0) or (pos(#0, s) > 0)) then
    result := StringReplace(s, #0, #$C0#$80, [rfReplaceAll]) = StringReplace(t, #0, #$C0#$80, [rfReplaceAll]); //allow both since modified utf-8
  if  debug then begin writeln; dump(t);end;


end;

var
  l: Integer;
  i,j: Integer;
  s:string;
  mycheck: Integer;
  refcheck: Boolean;
  scm: TStringConversionMode;
begin
  test(isValidModifiedUTF8('', scmConvertValidUTF8ToMUTF8), 0);
  test(isValidModifiedUTF8(#0, scmConvertValidUTF8ToMUTF8), 2);
  test(isValidModifiedUTF8(#$C0#$80, scmConvertValidUTF8ToMUTF8), 0);
  test(isValidModifiedUTF8(#$C1#$80, scmConvertValidUTF8ToMUTF8), 1);
  test(isValidModifiedUTF8('abcd', scmConvertValidUTF8ToMUTF8), 0);
  test(isValidModifiedUTF8('abäöüß', scmConvertValidUTF8ToMUTF8), 0);
  test(isValidModifiedUTF8('abäöü€'#$E3#$80#$8F, scmConvertValidUTF8ToMUTF8), 0);
  test(isValidModifiedUTF8('abäöü€'#$13#$80#$8F, scmConvertValidUTF8ToMUTF8), 1);
  test(isValidModifiedUTF8(#$F0#$90#$8c#$8c, scmConvertValidUTF8ToMUTF8), 2);
  test(isValidModifiedUTF8(#$F0#$90#$7c#$8c, scmConvertValidUTF8ToMUTF8), 1);
  test(isValidModifiedUTF8(#0#$F0#$90#$7c#$8c, scmConvertValidUTF8ToMUTF8), 1);

  test(isValidModifiedUTF8('', scmConvertAndRepairUTF8ToMUTF8), 0);
  test(isValidModifiedUTF8(#0, scmConvertAndRepairUTF8ToMUTF8), 2);
  test(isValidModifiedUTF8(#$C0#$80, scmConvertValidUTF8ToMUTF8), 0);
  test(isValidModifiedUTF8(#$C1#$80, scmConvertValidUTF8ToMUTF8), 1);
  test(isValidModifiedUTF8('abcd', scmConvertAndRepairUTF8ToMUTF8), 0);
  test(isValidModifiedUTF8('abäöüß', scmConvertAndRepairUTF8ToMUTF8), 0);
  test(isValidModifiedUTF8('abäöü€'#$E3#$80#$8F, scmConvertAndRepairUTF8ToMUTF8), 0);
  test(isValidModifiedUTF8('abäöü€'#$13#$80#$8F, scmConvertAndRepairUTF8ToMUTF8), 1);
  test(isValidModifiedUTF8(#$F0#$90#$8c#$8c, scmConvertAndRepairUTF8ToMUTF8), 2);
  test(isValidModifiedUTF8(#$F0#$90#$7c#$8c, scmConvertAndRepairUTF8ToMUTF8), 1);
  test(isValidModifiedUTF8(#0#$F0#$90#$7c#$8c, scmConvertAndRepairUTF8ToMUTF8), 2);

  for scm := scmConvertValidUTF8ToMUTF8 to scmConvertAndRepairUTF8ToMUTF8 do begin
    test(isValidModifiedUTF8(#$ED#$A0#$81#$ED#$B0#$80, scm), 0);
    test(isValidModifiedUTF8(#$C0#$80#$C0#$80#$4D#$61#$ED#$AE#$80#$ED#$B0#$80#$C0#$80, scm), 0);
  end;

  s := #140;
  writeln(FindInvalidUTF8Character(pchar(s),1));

  //fuzzing
  Randomize;
  for l := 1 to 12 do begin
    setlength(s, l);
    for i := 1 to 100000 do begin
      for j := 1 to  l do
        s[j] := chr(Random(256));
      mycheck := isValidModifiedUTF8(s, scmConvertValidUTF8ToMUTF8);
      refcheck := isValidUTF8(s, false);
      if (mycheck <> 1) <> refcheck then begin
       // writeln(l, ' ',i);
        dump(s);
        isValidUTF8(s, true);
        isValidModifiedUTF8(s, scmConvertValidUTF8ToMUTF8);

        raise Exception.Create(IntToStr(mycheck)  + ' <> !'+BoolToStr(refcheck));
      end;
    end;
  end;

  test(repairModifiedUTF8(''), '');
  test(repairModifiedUTF8(#0), #$C0#$80);
  test(repairModifiedUTF8(#$80#$C0), '??');
  test(repairModifiedUTF8(#$C0), '?');
  test(repairModifiedUTF8(#$C1), '?');
  test(repairModifiedUTF8(#$C2), '?');
  test(repairModifiedUTF8(#$E0), '?');
  test(repairModifiedUTF8(#$E0#$80), '?');
  test(repairModifiedUTF8(#$F0), '?');
  test(repairModifiedUTF8(#$F0#$80), '?');
  test(repairModifiedUTF8(#$F0#$80#$80), '?');
  test(repairModifiedUTF8(#$C0#$80), #$C0#$80);
  test(repairModifiedUTF8(#$C0#$D1), '?');
  test(repairModifiedUTF8(#$C0#$A5), '%');
  test(repairModifiedUTF8(#$C1#$98), 'X');
  test(repairModifiedUTF8(#$C1#$A1), 'a');
  test(repairModifiedUTF8(#$D4#$80), #$D4#$80);
  test(repairModifiedUTF8('abcd'), 'abcd');
  test(repairModifiedUTF8('abäöüß'), 'abäöüß');
  test(repairModifiedUTF8('abäöü€'#$E3#$80#$8F), 'abäöü€'#$E3#$80#$8F);
  test(repairModifiedUTF8('abäöü€'#$13#$80#$8F), 'abäöü€'#$13'??');
//not sure what it should be  test(repairModifiedUTF8(#$F0#$90#$8c#$8c), 2);
  test(repairModifiedUTF8(#$F0#$90#$7c#$8c), '?');
  test(repairModifiedUTF8(#0#$F0#$90#$7c#$8c), #$C0#$80'?');
  test(repairModifiedUTF8(#$F0#$90#$90#$80), #$ED#$A0#$81#$ED#$B0#$80); //example from wikipedia, 10400 => pairs: D801, DC00
  test(repairModifiedUTF8(#$4D#$61#$F3#$B0#$80#$80), #$4D#$61#$ED#$AE#$80#$ED#$B0#$80); //example from tr26
  test(repairModifiedUTF8(#0#0#$4D#$61#$F3#$B0#$80#$80#0), #$C0#$80#$C0#$80#$4D#$61#$ED#$AE#$80#$ED#$B0#$80#$C0#$80);
  test(repairModifiedUTF8(#$D4#$80), #$D4#$80);
  test(repairModifiedUTF8(#$E0#$80#$80), #$C0#$80);







  writeln('ok');

end.

