腾讯云TI-ONE训练平台提交训练任务_AI解决方案_同尘科技

TI-ONE 训练平台 2年前 (2023-08-02) 浏览 72
describe_postpaid_training_price(self) 查询每种配置每小时的价格,价格单位:元 :rtype: :class:`tencentcloud.tione.v20211111.models.DescribeBillingSpecsResponse`
describe_system_reasoning_images(self) 获取平台内置的推理镜像
:return: 推理镜像信息 :rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeInferTemplatesResponse` 数据格式如: { "FrameworkTemplates": [ { "Framework": "TENSORFLOW", "FrameworkVersion": "2.4", "Groups": [ "TENSORFLOW", "LIGHT" ], "InferTemplates": [ { "InferTemplateId": "tf2.4-py38-cpu", "InferTemplateImage": "ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-tensorflow-cpu:py38-tensorflow2.4-cpu-20211206" }, { "InferTemplateId": "tf2.4-py38-gpu", "InferTemplateImage": "ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-tensorflow-gpu:py38-tensorflow2.4-cu110-20211206" } ] } ], "RequestId": "3654e19b-c2ba-4953-b131-d66495723008" }
print的返回结果,输出如下:(镜像标识用来配置新模型的运行环境)
+------------+----------------+---------------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+
| 算法框架 | 算法框架版本号 | 支持的训练框架集合 | 镜像标识 | 镜像url |
+------------+----------------+---------------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+
| TENSORFLOW | 2.4 | ['TENSORFLOW', 'LIGHT'] | tf2.4-py38-cpu | ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-tensorflow-cpu:py38-tf2.4-cpu-1.0.0 |
| TENSORFLOW | 2.4 | ['TENSORFLOW', 'LIGHT'] | tf2.4-py38-gpu | ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-tensorflow-cpu:py38-tf2.4-cu11.0-1.0.0 |
| TENSORFLOW | 1.15 | ['TENSORFLOW', 'LIGHT', 'TI_ACC'] | tf1.15-py37-cpu | ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-tensorflow-cpu:py37-tf1.15-cpu-1.0.0 |
| TENSORFLOW | 1.15 | ['TENSORFLOW', 'LIGHT', 'TI_ACC'] | tf1.15-py37-gpu | ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-tensorflow-cpu:py37-tf1.15-cu10.0-1.0.0 |
| PYTORCH | 1.9 | ['PYTORCH', 'LIGHT', 'TI_ACC', 'AUTOML_CV'] | torch1.9.0-py38-cpu | ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-pytorch-cpu:py38-torch1.9.0-cpu-1.0.0 |
| PYTORCH | 1.9 | ['PYTORCH', 'LIGHT', 'TI_ACC', 'AUTOML_CV'] | torch1.9.0-py38-cu111 | ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-pytorch-gpu:py38-torch1.9.0-cu111-1.0.0 |
| PMML | 0.9.12 | ['SPARK', 'PYSPARK'] | pypmml-py38 | ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-pmml:py38-pmml0.9.16-1.0.0 |
| OCR | 112 | ['AUTOML_OCR'] | ocr-serving | ccr.ccs.tencentyun.com/qcloud-ti-platform/ocr_serving_gpu:serving-v121-335584d |
| DETECTRON2 | 112 | ['DETECTRON2'] | detectron2-torch1.9.0-py38-cu111 | ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-pytorch-gpu:py38-torch1.9.0-detectron2-cu111-1.0.0 |
+------------+----------------+---------------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+
def create_training_task(self, name, framework, cos_output, worker_resource, code_package_path, ps_resource=None, input_data_config=None, worker_start_cmd=None, ps_start_cmd=None, tags=None, tuning_parameters_dict={}, resource_group_id="", remark=None, log_enable=False, log_logset_id=None, log_topic_id=None, vpc_id=None, sub_net_id=None): """创建训练任务
:param name: 任务名称 :type name: str :param framework: 运行的框架环境 :type framework: :class:`tikit.models.FrameworkInfo` :param cos_output: 输出的cos信息 :type cos_output: str :param worker_resource: worker节点的配置 :type worker_resource: :class:`tikit.models.ResourceConfigInfo` :param code_package_path: 代码的cos信息 :type code_package_path: str :param ps_resource: ps节点的配置 :type ps_resource: :class:`tikit.models.ResourceConfigInfo` :param input_data_config: 输入的数据信息 :type input_data_config: list or :class:`tikit.models.TrainingDataConfig` :param worker_start_cmd: worker的启动命令 :type worker_start_cmd: str :param ps_start_cmd: ps节点的启动命令 :type ps_start_cmd: str :param tags: 标签 :type tags: list of :class:`tikit.tencentcloud.tione.v20211111.models.Tag` :param tuning_parameters_dict: 调参字典 :type tuning_parameters_dict: dict :param resource_group_id: 预付费的资源组id :type resource_group_id: str :param remark: 描述 :type remark: str :param log_enable: 日志开关 :type log_enable: bool :param log_logset_id: 日志集id :type log_logset_id: str :param log_topic_id: 日志的topic id :type log_topic_id: str :param vpc_id: vpc的id :type vpc_id: str :param sub_net_id: 子网id :type sub_net_id: str :return: :rtype: :class:`tikit.tencentcloud.tione.v20211111.models.CreateTrainingTaskResponse` """
def describe_training_frameworks(self): """查看训练框架
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeTrainingFrameworksResponse` print返回的结果,输出如下: +------------+----------------------------------------+-------------------------+ | 框架名称 | 版本 | 训练模式 | +------------+----------------------------------------+-------------------------+ | TENSORFLOW | ti-acc1.0-tf1.15-py3.6-cuda10.0-gpu | PS_WORKER | | TENSORFLOW | light3.1.3-tf2.4-py3.8-cuda11.1-gpu | MPI, HOROVOD | | TENSORFLOW | tf1.15-py3.7-cpu | PS_WORKER, MPI, HOROVOD | | TENSORFLOW | tf1.15-py3.7-cuda10.0-gpu | PS_WORKER, MPI, HOROVOD | | TENSORFLOW | tf2.4-py3.8-cpu | PS_WORKER, MPI, HOROVOD | | TENSORFLOW | tf2.4-py3.8-cuda11.1-gpu | PS_WORKER, MPI, HOROVOD | | PYTORCH | ti-acc1.0-torch1.9-py3.8-cuda11.1-gpu | DDP | | PYTORCH | light3.1.3-torch1.9-py3.8-cuda11.1-gpu | DDP, MPI, HOROVOD | | PYTORCH | torch1.9-py3.8-cuda11.1-gpu | DDP, MPI, HOROVOD | | SPARK | spark2.4.5-cpu | SPARK | | PYSPARK | spark2.4.5-py3.6-cpu | SPARK | +------------+----------------------------------------+-------------------------+ """
def describe_training_tasks(self, filters=None, tag_filters=None, offset=0, limit=50, order="DESC", order_field="UpdateTime"): """获取训练任务列表
:param filters: 过滤器,eg:[{ "Name": "TaskStatus", "Values": ["Running"] }] :type filters: list of Filter :param tag_filters: 标签过滤器,eg:[{ "TagKey": "TagKeyA", "TagValue": ["TagValueA"] }] :type tag_filters: list of TagFilter :param offset: 偏移量,默认为0 :type offset: int :param limit: 返回数量,默认为50 :type limit: int :param order: 输出列表的排列顺序。取值范围:ASC:升序排列 DESC:降序排列 :type order: str :param order_field: 排序的依据字段, 取值范围 "CreateTime" "UpdateTime" :type order_field: str :return: :rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeTrainingTasksResponse` """
describe_training_task(self, task_id) 获取单个训练任务信息 :param task_id: 训练任务ID :type task_id: str :rtype: :class:`tencentcloud.tione.v20211111.models.DescribeTrainingTaskResponse`
describe_training_task_pods(self, task_id) 获取训练任务的pod列表 :param task_id: 训练任务ID :type task_id: str :rtype: :class:`tencentcloud.tione.v20211111.models.DescribeTrainingTaskPodsResponse`
def describe_train_logs(self, pod_name, start_time=None, end_time=None, limit=None, order=None, context=None, filters=None): """查看训练任务的日志
:param pod_name: 查询哪个Pod的日志,支持通配符。查看某个训练任务的全部pod的日志可以填: "<task_id>*",如:"train-51cd6bf7ec1000*" :type pod_name: str :param start_time: 日志查询开始时间。RFC3339格式的时间字符串,比如2021-12-16T13:20:24+08:00,默认值为当前时间的前一个小时 :type start_time: str :param end_time: 日志查询结束时间。RFC3339格式的时间字符串,比如2021-12-16T13:20:24+08:00,默认值为当前时间 :type end_time: str :param limit: 日志查询条数,默认值100,最大值100 :type limit: int :param order: 排序方向。(ASC | DESC) 默认值为DESC :type order: str :param context: 分页的游标 :type context: str :param filters: 过滤Filters :type filters: list of tikit.tencentcloud.tione.v20211111.models.Filter :rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeLogsResponse`
返回的对象如果非空,就会有 next() 方法,能不断地获取下一页的日志(如果有多页的话),如下: now_time = datetime.datetime.now(datetime.timezone.utc) now_time_str = now_time.isoformat() result = client.describe_train_logs("train-51cd6bf7ec1000-37c5p5nlr01s-launcher", "2021-12-10T09:32:03.823509+00:00", now_time_str, limit=30) print(result) print(result.next()) print(result.next()) print(result.next()) """
def push_training_metrics(self, timestamp, value_map, task_id=None, epoch=None, total_steps=None, step=None): """上报训练自定义指标(单条)。单个子账号每秒可以调用20次,请在您的训练代码中注意控制上报频率,避免超限报错。或者使用push_training_metrics_list
:param timestamp: 时间戳 :type timestamp: int :param value_map: 指标映射。 指标名称 -> 指标值 :type value_map: map: str -> float :param task_id: 任务ID。若为空,就取当前任务节点环境的 TI_TASK_ID 环境变量的值 :type task_id: str :param epoch: epoch值 :type epoch: int :param total_steps: 总步数 :type total_steps: int :param step: 第几步 :type step: int :return: :rtype: :class:`tikit.tencentcloud.tione.v20211111.models.PushTrainingMetricsResponse`
client.push_training_metrics(int(time.time()), {"field1": 11, "field2": 12}, "task-id-00001", 3, 1000, 66) """
push_training_metrics_list(self, metric_list) 上报训练自定义指标(列表) :param metric_list: MetricData 数组。 若任务ID为空,就取当前任务节点环境的 TI_TASK_ID 环境变量的值 :type metric_list: list of :class:`tencentcloud.tione.v20211111.models.MetricData` :return: :rtype: :class:`tencentcloud.tione.v20211111.models.PushTrainingMetricsResponse`
describe_training_metrics(self, task_id) 查询训练自定义指标 :param task_id: 任务ID :type task_id: str :rtype: :class:`tencentcloud.tione.v20211111.models.DescribeTrainingMetricsResponse`
stop_training_task(self, task_id) 停止某个训练任务 :param task_id: 训练任务ID :type task_id: str :rtype: :class:`tencentcloud.tione.v20211111.models.StopTrainingTaskResponse`
delete_training_task(self, task_id) 删除某个训练任务 :param task_id: 训练任务ID :type task_id: str :rtype: :class:`tencentcloud.tione.v20211111.models.DeleteTrainingTaskResponse`
def describe_train_resource_groups(self, offset=0, limit=20, search_word="", tag_filters=None): """获取训练资源组列表
:param offset: 偏移量,默认为0;分页查询起始位置,如:Limit为20,第一页Offset为0,第二页Offset为20....即每页左边为开区间 :type offset: int :param limit: 返回数量,默认为20,最大值为30;分页查询每页大小,最大30 :type limit: int :param search_word: 支持模糊查找资源组id和资源组名 :type search_word: str :param tag_filters: 标签过滤 :type tag_filters: list of tikit.tencentcloud.tione.v20211111.models.Tag :return: :rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeBillingResourceGroupsResponse` """

中间配置:

tikit.models.FrameworkInfo
def new_custom(training_mode, image_type, image_url, registry_region=None, registry_id=None): """自定义训练框架的配置
:param training_mode: 训练模式。 通过describe_training_frameworks()查看列表 :type training_mode: str :param image_type: 腾讯云容器镜像服务的镜像类型,如"CCR" :type image_type: str :param image_url: 腾讯云容器镜像服务的镜像地址 :type image_url: str :param registry_region: 腾讯云容器镜像服务的镜像仓库的域 :type registry_region: str :param registry_id: 腾讯云容器镜像服务的镜像仓库ID :type registry_id: str :return: :rtype: """
def new_custom_image(image_type, image_url, registry_region=None, registry_id=None): """自定义镜像的配置
:param image_type: 腾讯云容器镜像服务的镜像类型,如"CCR" :type image_type: str :param image_url: 腾讯云容器镜像服务的镜像地址 :type image_url: str :param registry_region: 腾讯云容器镜像服务的镜像仓库的域 :type registry_region: str :param registry_id: 腾讯云容器镜像服务的镜像仓库ID :type registry_id: str :return: :rtype: """
def new_system_framework(framework_name, framework_environment, training_mode): """系统内置的训练框架
:param framework_name: 框架名称。 通过describe_training_frameworks()查看列表 :type framework_name: str :param framework_environment: 框架环境。 通过describe_training_frameworks()查看列表 :type framework_environment: str :param training_mode: 训练模式。 通过describe_training_frameworks()查看列表 :type training_mode: str :return: :rtype: """
----------------------------------------------------tikit.models.ResourceConfigInfo
def new_postpaid(instance_type, instance_num): """获取后付费模式下的资源配置
:param instance_type: 实例类型。通过 describe_postpaid_training_price() 查看实例列表 :type instance_type: str :param instance_num: 实例数量 :type instance_num: int :return: :rtype: """
def new_prepaid(cpu, memory, gpu=0, gpu_type=None, instance_num=1): """获取预付费模式下的资源配置
:param cpu: CPU个数,单位是核 :type cpu: float :param memory: 内存大小,单位是GB :type memory: float :param gpu_type: gpu类型 :type gpu_type: str :param gpu: gpu个数 :type gpu: float :param instance_num: 实例数量 :type instance_num: int :return: :rtype: """
----------------------------------------------------tikit.models.TrainingDataConfig
def new_mount_cos(cos_str, target_path): """一个cos类型的训练数据
:param cos_str: cos存储,格式: <bucket>/<cos path>/ :type cos_str: str :param target_path: 目标挂载路径 :type target_path: str :return: :rtype: """
def new_dataset_mount(dataset_id, target_path): """一个dataset类型的训练数据
:param dataset_id: 数据集ID :type dataset_id: str :param target_path: 目标挂载路径 :type target_path: str :return: :rtype: """
def new_mount_cfs(cfs_id, source_path, target_path): """新建一个cfs类型的训练数据集配置
:param cfs_id: CFS的ID :type cfs_id: str :param source_path: CFS的路径 :type source_path: str :param target_path: 目标挂载路径 :type target_path: str :return: :rtype: """
def new_mount_hdfs(hdfs_id, source_path, target_path): """新建一个hdfs类型的训练数据集配置
:param hdfs_id: EMR上HDFS的ID :type hdfs_id: str :param source_path: HDFS的路径 :type source_path: str :param target_path: 目标挂载路径 :type target_path: str :return: :rtype: """
def new_mount_wedata_hdfs(wedata_id, source_path): """新建一个wedata hdfs类型的训练数据集配置
:param wedata_id: wedata数据源id :type wedata_id: int :param source_path: HDFS的路径 :type source_path: str :return: :rtype: """
def new_dataset(id_target_dict): """ Deprecated ! 新建一个dataset类型的训练数据集配置
:param id_target_dict: 数据集信息。 dataset id -> 下载的目标路径 :type id_target_dict: dict :return: :rtype: """
def new_cos_data(cos_str_target_dict): """Deprecated ! 新建一个cos类型的训练数据集配置
:param cos_str_target_dict: 数据集信息。 <bucket>/<cos path>/ -> 下载的目标路径 :type cos_str_target_dict: dict :return: :rtype: """




对解决方案有疑惑?想了解解决方案收费? 联系解决方案专家

腾讯云限时活动1折起,即将结束: 马上收藏

同尘科技为腾讯云授权服务中心,购买腾讯云享受折上折,更有现金返利:同意关联,立享优惠

阿里云解决方案也看看?: 点击对比阿里云的解决方案

- 0人点赞 -

发表点评 (0条)

not found

暂无评论,你要说点什么吗?