## Code search

We index our own openai-python code repository and show how it can be searched. We implement a simple form of file parsing that extracts functions from Python files.

In [1]:
import os
from glob import glob
import pandas as pd

def get_function_name(code):
    """
    Extract function name from a line beginning with "def "

    The name is the text between the leading "def " and the first "(".
    Surrounding whitespace is removed, so "def foo (x):" yields "foo"
    rather than "foo ".

    Raises AssertionError if code does not start with "def " (when
    assertions are enabled) and ValueError if code contains no "(".
    """
    assert code.startswith("def ")
    # strip() guards against extra spaces after "def" or before "(".
    return code[len("def "): code.index("(")].strip()

def get_until_no_space(all_lines, i) -> str:
    """
    Get all lines until a line outside the function definition is found.

    all_lines is the file split into lines and i is the index of the
    "def " line. Following lines are collected while they are empty or
    start with a space, a tab or ")" (the closing parenthesis of a
    multi-line signature). The first line that does not match ends the
    function. Trailing blank lines are therefore part of the result.

    Returns the collected lines joined with "\n".
    """
    ret = [all_lines[i]]
    # Scan to the real end of the file instead of a fixed 10000-line window,
    # so long functions are not truncated and short files are not over-scanned.
    for j in range(i + 1, len(all_lines)):
        line = all_lines[j]
        if line and not line.startswith((" ", "\t", ")")):
            break
        ret.append(line)
    return "\n".join(ret)

def get_functions(filepath):
    """
    Get all functions in a Python file.

    Yields one dict per function found, with the keys "code" (the function
    source text), "function_name" and "filepath" (the path as passed in).
    Only lines that start with "def " at column 0 are recognised, so
    indented definitions are not yielded as separate entries.
    """
    # Read inside a context manager so the file handle is closed right away
    # instead of being left open until garbage collection.
    with open(filepath) as source_file:
        whole_code = source_file.read().replace("\r", "\n")
    all_lines = whole_code.split("\n")
    for i, line in enumerate(all_lines):
        if line.startswith("def "):
            code = get_until_no_space(all_lines, i)
            function_name = get_function_name(code)
            yield {"code": code, "function_name": function_name, "filepath": filepath}


# Home directory of the current user.
root_dir = os.path.expanduser("~")

# The repository to index is expected directly under the home directory.
code_root = root_dir + "/openai-python"

# Visit every directory below code_root and gather its .py files.
code_files = []
for dirpath, _dirnames, _filenames in os.walk(code_root):
    code_files.extend(glob(os.path.join(dirpath, '*.py')))
print("Total number of py files:", len(code_files))

# Pool the functions extracted from each file into one flat list.
all_funcs = []
for code_file in code_files:
    all_funcs.extend(get_functions(code_file))

print("Total number of functions extracted:", len(all_funcs))


Total number of py files: 40
Total number of functions extracted: 64


For code search we use `code-search-{model}-code` to obtain embeddings for code snippets, and `code-search-{model}-text` to embed natural language queries.

In [2]:
from openai.embeddings_utils import get_embedding

# One row per extracted function: code, function_name, filepath.
df = pd.DataFrame(all_funcs)
# Embed each function's source with the code-side model of the code-search pair.
df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, engine='code-search-babbage-code-001'))
# Drop the local repository root from each path.
df['filepath'] = df['filepath'].apply(lambda x: x.replace(code_root, ""))
# to_csv does not create missing directories, so make sure "output" exists first.
os.makedirs("output", exist_ok=True)
df.to_csv("output/code_search_openai-python.csv", index=False)
df.head()

Unnamed: 0,code,function_name,filepath,code_embedding
0,"def semantic_search(engine, query, documents):...",semantic_search,/examples/semanticsearch/semanticsearch.py,"[-0.038976121693849564, -0.0031428150832653046..."
1,def main():\n parser = argparse.ArgumentPar...,main,/examples/semanticsearch/semanticsearch.py,"[-0.024289356544613838, -0.017748363316059113,..."
2,"def get_candidates(\n prompt: str,\n sto...",get_candidates,/examples/codex/backtranslation.py,"[-0.04161201789975166, -0.0169310811907053, 0...."
3,"def rindex(lst: List, value: str) -> int:\n ...",rindex,/examples/codex/backtranslation.py,"[-0.027255680412054062, -0.007931121625006199,..."
4,def eval_candidate(\n candidate_answer: str...,eval_candidate,/examples/codex/backtranslation.py,"[-0.00999179296195507, -0.01640152558684349, 0..."


In [5]:
from openai.embeddings_utils import cosine_similarity

def search_functions(df, code_query, n=3, pprint=True, n_lines=7):
    """
    Rank the functions in df against a natural language query.

    The query is embedded with the text-side model and compared by cosine
    similarity with every entry of df.code_embedding. The scores are written
    to a 'similarities' column on df itself.

    When pprint is true, each of the n best matches is printed as
    "filepath:function_name  score=...", followed by the first n_lines lines
    of its code and a separator line.

    Returns the n highest-scoring rows, best match first.
    """
    query_vector = get_embedding(code_query, engine='code-search-babbage-text-001')
    df['similarities'] = df.code_embedding.apply(
        lambda code_vector: cosine_similarity(code_vector, query_vector)
    )

    top_matches = df.sort_values('similarities', ascending=False).head(n)
    if not pprint:
        return top_matches

    for _, row in top_matches.iterrows():
        score_text = str(round(row.similarities, 3))
        print(row.filepath + ":" + row.function_name + "  score=" + score_text)
        preview_lines = row.code.split("\n")[:n_lines]
        print("\n".join(preview_lines))
        print('-' * 70)
    return top_matches
# Print the three best matches for a query about the Completions API tests.
res = search_functions(df, 'Completions API tests', n=3)


/openai/tests/test_endpoints.py:test_completions_multiple_prompts  score=0.681
def test_completions_multiple_prompts():
    result = openai.Completion.create(
        prompt=["This was a test", "This was another test"], n=5, engine="ada"
    )
    assert len(result.choices) == 10

----------------------------------------------------------------------
/openai/tests/test_endpoints.py:test_completions  score=0.675
def test_completions():
    result = openai.Completion.create(prompt="This was a test", n=5, engine="ada")
    assert len(result.choices) == 5


----------------------------------------------------------------------
/openai/tests/test_api_requestor.py:test_requestor_sets_request_id  score=0.635
def test_requestor_sets_request_id(mocker: MockerFixture) -> None:
    # Fake out 'requests' and confirm that the X-Request-Id header is set.

    got_headers = {}

    def fake_request(self, *args, **kwargs):
        nonlocal got_headers
----------------------------------------------------------------------

In [6]:
# Print the three best matches for a query about fine-tuning data validation.
res = search_functions(df, 'fine-tuning input data validation logic', n=3)

/openai/validators.py:format_inferrer_validator  score=0.655
def format_inferrer_validator(df):
    """
    This validator will infer the likely fine-tuning format of the data, and display it to the user if it is classification.
    It will also suggest to use ada and explain train/validation split benefits.
    """
    ft_type = infer_task_type(df)
    immediate_msg = None
----------------------------------------------------------------------
/openai/validators.py:long_examples_validator  score=0.649
def long_examples_validator(df):
    """
    This validator will suggest to the user to remove examples that are too long.
    """
    immediate_msg = None
    optional_msg = None
    optional_fn = None
----------------------------------------------------------------------
/openai/validators.py:non_empty_completion_validator  score=0.646
def non_empty_completion_validator(df):
    """
    This validator will ensure that no completion is empty.
    """
    necessary_msg = None
    necessar

In [7]:
# Print the two best matches, showing up to 10 lines of code for each.
res = search_functions(df, 'find common suffix', n=2, n_lines=10)

/openai/validators.py:common_completion_suffix_validator  score=0.665
def common_completion_suffix_validator(df):
    """
    This validator will suggest to add a common suffix to the completion if one doesn't already exist in case of classification or conditional generation.
    """
    error_msg = None
    immediate_msg = None
    optional_msg = None
    optional_fn = None

    ft_type = infer_task_type(df)
----------------------------------------------------------------------
/openai/validators.py:get_outfnames  score=0.66
def get_outfnames(fname, split):
    suffixes = ["_train", "_valid"] if split else [""]
    i = 0
    while True:
        index_suffix = f" ({i})" if i > 0 else ""
        candidate_fnames = [
            fname.split(".")[0] + "_prepared" + suffix + index_suffix + ".jsonl"
            for suffix in suffixes
        ]
        if not any(os.path.isfile(f) for f in candidate_fnames):
----------------------------------------------------------------------


In [8]:
# Print only the best match, showing up to 20 lines of its code.
res = search_functions(df, 'Command line interface for fine-tuning', n=1, n_lines=20)

/openai/cli.py:tools_register  score=0.651
def tools_register(parser):
    subparsers = parser.add_subparsers(
        title="Tools", help="Convenience client side tools"
    )

    def help(args):
        parser.print_help()

    parser.set_defaults(func=help)

    sub = subparsers.add_parser("fine_tunes.prepare_data")
    sub.add_argument(
        "-f",
        "--file",
        required=True,
        help="JSONL, JSON, CSV, TSV, TXT or XLSX file containing prompt-completion examples to be analyzed."
        "This should be the local file path.",
    )
    sub.add_argument(
        "-q",
----------------------------------------------------------------------
